forked from wylab/llama.cpp
Compare commits
71 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 89fea80d29 | |||
| 6adc3c3ebc | |||
| 0dbcabde8c | |||
| ad590be98c | |||
| 7d6d91babf | |||
| d3e64b9f49 | |||
| 3ba0d843c6 | |||
| 0bf49eb668 | |||
| 4ad243677b | |||
| c89c2d1ab9 | |||
| 3555b3004b | |||
| d7da8dc83a | |||
| cd355eda7d | |||
| 30e5b01de2 | |||
| e54b394082 | |||
| 2c2caa4443 | |||
| 5fce5f948d | |||
| 9ae4143bc6 | |||
| c311ac664d | |||
| b9912ac570 | |||
| 00ba772610 | |||
| 3cb203c89f | |||
| 2e42be42bd | |||
| fb85a288d7 | |||
| 40643edb86 | |||
| 3cfbbdb44e | |||
| 80709b70a2 | |||
| 26ff3685bf | |||
| 60c666347b | |||
| b7cc7745e3 | |||
| cc8d081879 | |||
| d714dadb57 | |||
| ffad043973 | |||
| 0889eba570 | |||
| c61285e739 | |||
| 09cf2c7c65 | |||
| c33fe8b8c4 | |||
| ed52f3668e | |||
| a681b4ba83 | |||
| 7d516443dd | |||
| f6e1a7aa87 | |||
| c3ee46fab4 | |||
| e2c0b6e46a | |||
| 9596506965 | |||
| a20b2b05bc | |||
| 2e89f76b7a | |||
| 532802f938 | |||
| d4e0d95cf5 | |||
| cc66a7f78f | |||
| bd248d4dc7 | |||
| 7781e5fe99 | |||
| 89a184fa71 | |||
| 2baf07727f | |||
| 7ae2932116 | |||
| 1f7d50b293 | |||
| 4c763c8d1b | |||
| dad5c44398 | |||
| 55f6b9fa65 | |||
| 3678b838bb | |||
| 652b70e667 | |||
| 3a12db23b6 | |||
| ae92c1855b | |||
| b7ce1ad1e3 | |||
| 97340b4c99 | |||
| 2bb0467043 | |||
| b8e2194efc | |||
| 1a3b5e80f7 | |||
| 1f63e75f3b | |||
| 40cbf571c9 | |||
| 7f4fbe5183 | |||
| f470bc36be |
+16
-12
@@ -49,19 +49,23 @@ COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
&& pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-venv && \
|
||||
python3 -m venv /opt/venv && \
|
||||
. /opt/venv/bin/activate && \
|
||||
pip install --upgrade pip setuptools wheel && \
|
||||
pip install -r requirements.txt && \
|
||||
apt autoremove -y && \
|
||||
apt clean -y && \
|
||||
rm -rf /tmp/* /var/tmp/* && \
|
||||
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
|
||||
find /var/cache -type f -delete
|
||||
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
|
||||
|
||||
@@ -306,6 +306,7 @@ jobs:
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
export GGML_VK_VISIBLE_DEVICES=0
|
||||
# This is using llvmpipe and runs slower than other backends
|
||||
ctest -L main --verbose --timeout 3600
|
||||
|
||||
@@ -687,12 +688,12 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'cpu-x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
|
||||
- build: 'cpu-x64 (static)'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
|
||||
- build: 'openblas-x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'vulkan-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
|
||||
defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
|
||||
- build: 'llvm-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
|
||||
- build: 'llvm-arm64-opencl-adreno'
|
||||
@@ -777,6 +778,7 @@ jobs:
|
||||
cmake -S . -B build ${{ matrix.defines }} `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
|
||||
+10
-4
@@ -89,6 +89,14 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
|
||||
|
||||
if (NOT DEFINED LLAMA_BUILD_NUMBER)
|
||||
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
||||
endif()
|
||||
if (NOT DEFINED LLAMA_BUILD_COMMIT)
|
||||
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
endif()
|
||||
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
||||
|
||||
# override ggml options
|
||||
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
||||
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
||||
@@ -155,6 +163,8 @@ if (LLAMA_USE_SYSTEM_GGML)
|
||||
endif()
|
||||
|
||||
if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
|
||||
set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
|
||||
set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
|
||||
add_subdirectory(ggml)
|
||||
# ... otherwise assume ggml is added by a parent CMakeLists.txt
|
||||
endif()
|
||||
@@ -204,10 +214,6 @@ endif()
|
||||
include(GNUInstallDirs)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
||||
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
||||
|
||||
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
|
||||
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
||||
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
||||
|
||||
@@ -367,7 +367,7 @@ ifdef LLAMA_SERVER_SSL
|
||||
endif
|
||||
|
||||
ifndef GGML_NO_CPU_AARCH64
|
||||
MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
|
||||
MK_CPPFLAGS += -DGGML_USE_CPU_REPACK
|
||||
endif
|
||||
|
||||
# warnings
|
||||
@@ -970,7 +970,7 @@ OBJ_GGML = \
|
||||
$(DIR_GGML)/src/ggml-threading.o \
|
||||
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
|
||||
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
|
||||
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
|
||||
$(DIR_GGML)/src/ggml-cpu/repack.o \
|
||||
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
|
||||
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
|
||||
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
[](https://github.com/ggml-org/llama.cpp/releases)
|
||||
[](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
|
||||
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
|
||||
|
||||
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
||||
|
||||
@@ -18,7 +18,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
## Hot topics
|
||||
|
||||
- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
||||
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
|
||||
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
|
||||
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
|
||||
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
||||
|
||||
@@ -39,7 +39,7 @@ sd=`dirname $0`
|
||||
cd $sd/../
|
||||
SRC=`pwd`
|
||||
|
||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
|
||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
|
||||
|
||||
+10
-20
@@ -7,8 +7,8 @@ llama_add_compile_flags()
|
||||
# Build info header
|
||||
#
|
||||
|
||||
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
||||
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
|
||||
set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
|
||||
|
||||
# Is git submodule
|
||||
if(NOT IS_DIRECTORY "${GIT_DIR}")
|
||||
@@ -18,36 +18,26 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
||||
if (SLASH_POS EQUAL 0)
|
||||
set(GIT_DIR "${REAL_GIT_DIR}")
|
||||
else()
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
|
||||
set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(EXISTS "${GIT_DIR}/index")
|
||||
set(GIT_INDEX "${GIT_DIR}/index")
|
||||
# For build-info.cpp below
|
||||
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
|
||||
else()
|
||||
message(WARNING "Git index not found in git repository.")
|
||||
set(GIT_INDEX "")
|
||||
endif()
|
||||
else()
|
||||
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
|
||||
set(GIT_INDEX "")
|
||||
endif()
|
||||
|
||||
# Add a custom command to rebuild build-info.cpp when .git/index changes
|
||||
add_custom_command(
|
||||
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
|
||||
COMMENT "Generating build details from Git"
|
||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
||||
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
|
||||
-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
|
||||
-P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
|
||||
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
||||
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
||||
VERBATIM
|
||||
)
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
|
||||
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
|
||||
set(TARGET build_info)
|
||||
add_library(${TARGET} OBJECT build-info.cpp)
|
||||
add_library(${TARGET} OBJECT ${OUTPUT_FILE})
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
|
||||
+3
-6
@@ -988,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
||||
}
|
||||
|
||||
if (params.reranking && params.embedding) {
|
||||
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
|
||||
}
|
||||
|
||||
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
|
||||
throw std::runtime_error(string_format(
|
||||
"error: the supplied chat template is not supported: %s%s\n",
|
||||
@@ -2747,9 +2743,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
||||
add_opt(common_arg(
|
||||
{"--reranking", "--rerank"},
|
||||
string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
|
||||
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
|
||||
[](common_params & params) {
|
||||
params.reranking = true;
|
||||
params.embedding = true;
|
||||
params.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
|
||||
add_opt(common_arg(
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
|
||||
char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
|
||||
int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
|
||||
char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
|
||||
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
|
||||
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
|
||||
|
||||
@@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
|
||||
|
||||
// LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
|
||||
result_.tool_calls.emplace_back(tool_call);
|
||||
|
||||
return true;
|
||||
}
|
||||
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
|
||||
@@ -378,3 +379,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
|
||||
/* .is_partial = */ found_healing_marker,
|
||||
};
|
||||
}
|
||||
|
||||
void common_chat_msg_parser::clear_tools() {
|
||||
result_.tool_calls.clear();
|
||||
}
|
||||
|
||||
@@ -115,4 +115,6 @@ class common_chat_msg_parser {
|
||||
const std::vector<std::vector<std::string>> & args_paths = {},
|
||||
const std::vector<std::vector<std::string>> & content_paths = {}
|
||||
);
|
||||
|
||||
void clear_tools();
|
||||
};
|
||||
|
||||
+3
-1
@@ -1921,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
|
||||
} catch (const common_chat_msg_partial_exception & ex) {
|
||||
LOG_DBG("Partial parse: %s\n", ex.what());
|
||||
if (!is_partial) {
|
||||
throw std::runtime_error(ex.what());
|
||||
builder.clear_tools();
|
||||
builder.move_to(0);
|
||||
common_chat_parse_content_only(builder);
|
||||
}
|
||||
}
|
||||
auto msg = builder.result();
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
||||
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
||||
|
||||
# Only write the build info if it changed
|
||||
if(EXISTS ${OUTPUT_FILE})
|
||||
file(READ ${OUTPUT_FILE} CONTENTS)
|
||||
string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_COMMIT ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_COMPILER ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_TARGET ${CMAKE_MATCH_1})
|
||||
if (
|
||||
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
|
||||
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
|
||||
NOT OLD_TARGET STREQUAL BUILD_TARGET
|
||||
)
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
endif()
|
||||
else()
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
endif()
|
||||
+42
-38
@@ -466,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
|
||||
|
||||
std::string regex_escape(const std::string & s) {
|
||||
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
|
||||
return std::regex_replace(s, special_chars, "\\$0");
|
||||
return std::regex_replace(s, special_chars, "\\$&");
|
||||
}
|
||||
|
||||
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
|
||||
@@ -767,6 +767,9 @@ bool fs_validate_filename(const std::string & filename) {
|
||||
return true;
|
||||
}
|
||||
|
||||
#include <iostream>
|
||||
|
||||
|
||||
// returns true if successful, false otherwise
|
||||
bool fs_create_directory_with_parents(const std::string & path) {
|
||||
#ifdef _WIN32
|
||||
@@ -784,9 +787,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
||||
// process path from front to back, procedurally creating directories
|
||||
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
|
||||
const std::wstring subpath = wpath.substr(0, pos_slash);
|
||||
const wchar_t * test = subpath.c_str();
|
||||
|
||||
const bool success = CreateDirectoryW(test, NULL);
|
||||
pos_slash += 1;
|
||||
|
||||
// skip the drive letter, in some systems it can return an access denied error
|
||||
if (subpath.length() == 2 && subpath[1] == ':') {
|
||||
continue;
|
||||
}
|
||||
|
||||
const bool success = CreateDirectoryW(subpath.c_str(), NULL);
|
||||
|
||||
if (!success) {
|
||||
const DWORD error = GetLastError();
|
||||
|
||||
@@ -800,8 +810,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
pos_slash += 1;
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -897,34 +905,6 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
if (params.reranking) {
|
||||
bool ok = true;
|
||||
|
||||
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
||||
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
||||
|
||||
if (!has_eos && !has_sep) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
} else if (!has_eos) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
||||
} else if (!has_sep) {
|
||||
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
llama_model_free(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
}
|
||||
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
|
||||
llama_context * lctx = llama_init_from_model(model, cparams);
|
||||
@@ -966,6 +946,35 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
}
|
||||
}
|
||||
|
||||
if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
|
||||
bool ok = true;
|
||||
|
||||
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
||||
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
||||
|
||||
if (!has_eos && !has_sep) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
} else if (!has_eos) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
||||
} else if (!has_sep) {
|
||||
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
llama_free(lctx);
|
||||
llama_model_free(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
}
|
||||
|
||||
// load and optionally apply lora adapters
|
||||
for (auto & la : params.lora_adapters) {
|
||||
llama_adapter_lora_ptr lora;
|
||||
@@ -1143,11 +1152,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||
cparams.op_offload = !params.no_op_offload;
|
||||
cparams.swa_full = params.swa_full;
|
||||
|
||||
if (params.reranking) {
|
||||
cparams.embeddings = true;
|
||||
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
||||
}
|
||||
|
||||
cparams.type_k = params.cache_type_k;
|
||||
cparams.type_v = params.cache_type_v;
|
||||
|
||||
|
||||
@@ -355,7 +355,6 @@ struct common_params {
|
||||
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
||||
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
||||
std::string embd_sep = "\n"; // separator of embeddings
|
||||
bool reranking = false; // enable reranking support on server
|
||||
|
||||
// server params
|
||||
int32_t port = 8080; // server listens on this network port
|
||||
|
||||
+76
-30
@@ -519,7 +519,7 @@ class TextModel(ModelBase):
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
|
||||
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
|
||||
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
|
||||
self.gguf_writer.add_context_length(n_ctx)
|
||||
logger.info(f"gguf: context length = {n_ctx}")
|
||||
|
||||
@@ -556,8 +556,11 @@ class TextModel(ModelBase):
|
||||
logger.info(f"gguf: experts used count = {n_experts_used}")
|
||||
|
||||
if (head_dim := self.hparams.get("head_dim")) is not None:
|
||||
self.gguf_writer.add_key_length(head_dim)
|
||||
self.gguf_writer.add_value_length(head_dim)
|
||||
# Workaround for incorrect AutoConfig value for DeepSeekV3 (is set correctly in DeepSeekV2Model class)
|
||||
# https://github.com/huggingface/transformers/blob/19224c3642705c5b6988c9f5f4251f83323d05ae/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py#L210
|
||||
if self.hparams.get("model_type") != "deepseek_v3":
|
||||
self.gguf_writer.add_key_length(head_dim)
|
||||
self.gguf_writer.add_value_length(head_dim)
|
||||
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
logger.info(f"gguf: file type = {self.ftype}")
|
||||
@@ -2017,6 +2020,20 @@ class LlamaModel(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("ArceeForCausalLM")
|
||||
class ArceeModel(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.ARCEE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self._try_set_pooling_type()
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
|
||||
@ModelBase.register(
|
||||
"LlavaForConditionalGeneration", # pixtral
|
||||
"Mistral3ForConditionalGeneration", # mistral small 3.1
|
||||
@@ -4059,6 +4076,34 @@ class NomicBertModel(BertModel):
|
||||
raise ValueError(f"unknown tokenizer: {toktyp}")
|
||||
|
||||
|
||||
@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
|
||||
class NeoBert(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.NEO_BERT
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# NeoBERT uses 2/3 of the intermediate size as feed forward length
|
||||
self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
|
||||
self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
|
||||
f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT
|
||||
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
|
||||
logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
|
||||
|
||||
self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
|
||||
|
||||
def modify_tensors(self, data_torch, name, bid):
|
||||
if name.startswith("decoder."):
|
||||
return []
|
||||
|
||||
if name.startswith("model."):
|
||||
name = name[6:]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
|
||||
class XLMRobertaModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
@@ -4798,25 +4843,6 @@ class OlmoeModel(TextModel):
|
||||
class JinaBertV2Model(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.intermediate_size = self.hparams["intermediate_size"]
|
||||
|
||||
def get_tensors(self):
|
||||
for name, data in super().get_tensors():
|
||||
if 'gated_layer' in name:
|
||||
d1 = data[:self.intermediate_size, :]
|
||||
name1 = name.replace('gated_layers', 'gated_layers_w')
|
||||
name1 = name1.replace('up_gated_layer', 'gated_layers_v')
|
||||
d2 = data[self.intermediate_size:, :]
|
||||
name2 = name.replace('gated_layers', 'gated_layers_v')
|
||||
name2 = name2.replace('up_gated_layer', 'gated_layers_w')
|
||||
yield name1, d1
|
||||
yield name2, d2
|
||||
continue
|
||||
|
||||
yield name, data
|
||||
|
||||
def set_vocab(self):
|
||||
tokenizer_class = 'BertTokenizer'
|
||||
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
@@ -4832,14 +4858,6 @@ class JinaBertV2Model(BertModel):
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# if name starts with "bert.", remove the prefix
|
||||
# e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
||||
if name.startswith("bert."):
|
||||
name = name[5:]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("OpenELMForCausalLM")
|
||||
class OpenELMModel(TextModel):
|
||||
@@ -5286,6 +5304,34 @@ class DeepseekV2Model(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("Dots1ForCausalLM")
|
||||
class Dots1Model(Qwen2MoeModel):
|
||||
model_arch = gguf.MODEL_ARCH.DOTS1
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.hparams["num_experts"] = self.hparams["n_routed_experts"]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
|
||||
self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
|
||||
self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
|
||||
self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
|
||||
|
||||
if self.hparams["scoring_func"] == "noaux_tc":
|
||||
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
|
||||
else:
|
||||
raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
if name.endswith("e_score_correction_bias"):
|
||||
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
|
||||
if "shared_experts" in name:
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("PLMForCausalLM")
|
||||
class PLMModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.PLM
|
||||
|
||||
@@ -11,7 +11,7 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
|
||||
- Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
|
||||
- Functionary v3.1 / v3.2
|
||||
- Hermes 2/3, Qwen 2.5
|
||||
- Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
|
||||
- Qwen 2.5 Coder
|
||||
- Mistral Nemo
|
||||
- Firefunction v2
|
||||
- Command R7B
|
||||
|
||||
@@ -107,3 +107,7 @@ NOTE: some models may require large context window, for example: `-c 8192`
|
||||
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
|
||||
```
|
||||
|
||||
## Finding more models:
|
||||
|
||||
GGUF models on Huggingface with vision capabilities can be found here: https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending&search=gguf
|
||||
|
||||
@@ -41,12 +41,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||
|
||||
// add input to batch (this increments n_tokens)
|
||||
for (int32_t j = 0; j < n_toks; j++) {
|
||||
common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
|
||||
common_batch_add(batch, inputs[j], j, { 0 }, true);
|
||||
}
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_memory_clear(llama_get_memory(ctx), true);
|
||||
llama_set_embeddings(ctx, true);
|
||||
llama_set_causal_attn(ctx, false);
|
||||
|
||||
// run model
|
||||
@@ -103,7 +102,6 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
|
||||
llama_token eos_token = llama_vocab_eos(vocab);
|
||||
|
||||
llama_memory_clear(llama_get_memory(ctx), true);
|
||||
llama_set_embeddings(ctx, false);
|
||||
llama_set_causal_attn(ctx, true);
|
||||
|
||||
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||
@@ -166,6 +164,8 @@ int main(int argc, char * argv[]) {
|
||||
llama_model_params mparams = common_model_params_to_llama(params);
|
||||
llama_context_params cparams = common_context_params_to_llama(params);
|
||||
|
||||
cparams.embeddings = true;
|
||||
|
||||
llama_backend_init();
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
|
||||
@@ -213,6 +213,8 @@ int main(int argc, char * argv[]) {
|
||||
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
|
||||
}
|
||||
|
||||
llama_set_embeddings(ctx, false);
|
||||
|
||||
// ### Generation ###
|
||||
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
|
||||
{
|
||||
|
||||
+2
-1
@@ -105,7 +105,7 @@ message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
|
||||
message(DEBUG "INS_ENB : ${INS_ENB}")
|
||||
|
||||
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
||||
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
||||
option(GGML_CPU_REPACK "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
||||
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
|
||||
option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
|
||||
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
||||
@@ -172,6 +172,7 @@ option(GGML_HIP "ggml: use HIP"
|
||||
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
|
||||
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
||||
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
||||
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
|
||||
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
||||
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
||||
|
||||
@@ -36,8 +36,7 @@ function(ggml_get_system_arch)
|
||||
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
||||
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
|
||||
set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
|
||||
elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
|
||||
"${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
|
||||
set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
|
||||
|
||||
+42
-11
@@ -212,6 +212,7 @@ endif()
|
||||
|
||||
add_library(ggml
|
||||
ggml-backend-reg.cpp)
|
||||
add_library(ggml::ggml ALIAS ggml)
|
||||
|
||||
target_link_libraries(ggml PUBLIC ggml-base)
|
||||
|
||||
@@ -269,17 +270,23 @@ endfunction()
|
||||
function(ggml_add_cpu_backend_variant tag_name)
|
||||
set(GGML_CPU_TAG_NAME ${tag_name})
|
||||
# other: OPENMP LLAMAFILE CPU_HBM
|
||||
foreach (feat NATIVE
|
||||
SSE42
|
||||
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
||||
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
||||
AMX_TILE AMX_INT8 AMX_BF16)
|
||||
set(GGML_${feat} OFF)
|
||||
endforeach()
|
||||
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
||||
foreach (feat NATIVE
|
||||
SSE42
|
||||
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
||||
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
||||
AMX_TILE AMX_INT8 AMX_BF16)
|
||||
set(GGML_${feat} OFF)
|
||||
endforeach()
|
||||
|
||||
foreach (feat ${ARGN})
|
||||
set(GGML_${feat} ON)
|
||||
endforeach()
|
||||
foreach (feat ${ARGN})
|
||||
set(GGML_${feat} ON)
|
||||
endforeach()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
|
||||
foreach (feat ${ARGN})
|
||||
set(GGML_INTERNAL_${feat} ON)
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
ggml_add_cpu_backend_variant_impl(${tag_name})
|
||||
endfunction()
|
||||
@@ -289,6 +296,8 @@ ggml_add_backend(CPU)
|
||||
if (GGML_CPU_ALL_VARIANTS)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
|
||||
elseif (GGML_CPU_ARM_ARCH)
|
||||
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
||||
ggml_add_cpu_backend_variant(x64)
|
||||
@@ -302,8 +311,30 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
# MSVC doesn't support AMX
|
||||
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
endif()
|
||||
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
# Many of these features are optional so we build versions with popular
|
||||
# combinations and name the backends based on the version they were
|
||||
# first released with
|
||||
ggml_add_cpu_backend_variant(armv8.0_1)
|
||||
ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
|
||||
ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
||||
ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
|
||||
ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
|
||||
ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
|
||||
ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
|
||||
ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
|
||||
elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
|
||||
# Android-specific backends with SoC-compatible feature sets
|
||||
ggml_add_cpu_backend_variant(android_armv8.0_1)
|
||||
ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
|
||||
ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
||||
ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
|
||||
else()
|
||||
message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
|
||||
endif()
|
||||
else()
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}")
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
|
||||
endif()
|
||||
elseif (GGML_CPU)
|
||||
ggml_add_cpu_backend_variant_impl("")
|
||||
|
||||
@@ -1074,6 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
|
||||
0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
|
||||
GGML_TABLE_END()
|
||||
|
||||
GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
|
||||
-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
|
||||
GGML_TABLE_END()
|
||||
|
||||
#define NGRID_IQ1S 2048
|
||||
#define IQ1S_DELTA 0.125f
|
||||
#define IQ1M_DELTA 0.125f
|
||||
|
||||
@@ -1,3 +1,17 @@
|
||||
function(ggml_add_cpu_backend_features cpu_name arch)
|
||||
# The feature detection code is compiled as a separate target so that
|
||||
# it can be built without the architecture flags
|
||||
# Since multiple variants of the CPU backend may be included in the same
|
||||
# build, using set_source_files_properties() to set the arch flags is not possible
|
||||
set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
|
||||
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
|
||||
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
||||
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
|
||||
endfunction()
|
||||
|
||||
function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (tag_name)
|
||||
set(GGML_CPU_NAME ggml-cpu-${tag_name})
|
||||
@@ -10,14 +24,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
list (APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/ggml-cpu.c
|
||||
ggml-cpu/ggml-cpu.cpp
|
||||
ggml-cpu/ggml-cpu-aarch64.cpp
|
||||
ggml-cpu/ggml-cpu-aarch64.h
|
||||
ggml-cpu/ggml-cpu-hbm.cpp
|
||||
ggml-cpu/ggml-cpu-hbm.h
|
||||
ggml-cpu/ggml-cpu-quants.c
|
||||
ggml-cpu/ggml-cpu-quants.h
|
||||
ggml-cpu/ggml-cpu-traits.cpp
|
||||
ggml-cpu/ggml-cpu-traits.h
|
||||
ggml-cpu/repack.cpp
|
||||
ggml-cpu/repack.h
|
||||
ggml-cpu/hbm.cpp
|
||||
ggml-cpu/hbm.h
|
||||
ggml-cpu/quants.c
|
||||
ggml-cpu/quants.h
|
||||
ggml-cpu/traits.cpp
|
||||
ggml-cpu/traits.h
|
||||
ggml-cpu/amx/amx.cpp
|
||||
ggml-cpu/amx/amx.h
|
||||
ggml-cpu/amx/mmq.cpp
|
||||
@@ -84,6 +98,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
|
||||
if (GGML_SYSTEM_ARCH STREQUAL "ARM")
|
||||
message(STATUS "ARM detected")
|
||||
list(APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/arch/arm/quants.c
|
||||
ggml-cpu/arch/arm/repack.cpp
|
||||
)
|
||||
|
||||
if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
||||
message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
|
||||
else()
|
||||
@@ -138,6 +157,46 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
else()
|
||||
if (GGML_CPU_ARM_ARCH)
|
||||
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
|
||||
elseif(GGML_CPU_ALL_VARIANTS)
|
||||
# Begin with the lowest baseline
|
||||
set(ARM_MCPU "armv8-a")
|
||||
set(ARCH_TAGS "")
|
||||
set(ARCH_DEFINITIONS "")
|
||||
|
||||
# When a feature is selected, bump the MCPU to the first
|
||||
# version that supported it
|
||||
if (GGML_INTERNAL_DOTPROD)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
|
||||
endif()
|
||||
if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+fp16")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
|
||||
endif()
|
||||
if (GGML_INTERNAL_SVE)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sve")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
|
||||
endif()
|
||||
if (GGML_INTERNAL_MATMUL_INT8)
|
||||
set(ARM_MCPU "armv8.6-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
|
||||
endif()
|
||||
if (GGML_INTERNAL_SVE2)
|
||||
set(ARM_MCPU "armv8.6-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sve2")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
|
||||
endif()
|
||||
if (GGML_INTERNAL_SME)
|
||||
set(ARM_MCPU "armv9.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sme")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
|
||||
endif()
|
||||
list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
|
||||
ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -167,6 +226,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
|
||||
message(STATUS "x86 detected")
|
||||
list(APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/arch/x86/quants.c
|
||||
ggml-cpu/arch/x86/repack.cpp
|
||||
)
|
||||
|
||||
if (MSVC)
|
||||
# instruction set detection for MSVC only
|
||||
if (GGML_NATIVE)
|
||||
@@ -296,21 +360,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
|
||||
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
|
||||
# The feature detection code is compiled as a separate target so that
|
||||
# it can be built without the architecture flags
|
||||
# Since multiple variants of the CPU backend may be included in the same
|
||||
# build, using set_source_files_properties() to set the arch flags is not possible
|
||||
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
|
||||
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
|
||||
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
||||
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
|
||||
ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
|
||||
message(STATUS "PowerPC detected")
|
||||
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/powerpc/quants.c)
|
||||
if (GGML_NATIVE)
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||
file(READ "/proc/cpuinfo" POWER10_M)
|
||||
@@ -338,6 +392,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64")
|
||||
message(STATUS "loongarch64 detected")
|
||||
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/loongarch/quants.c)
|
||||
|
||||
list(APPEND ARCH_FLAGS -march=loongarch64)
|
||||
if (GGML_LASX)
|
||||
list(APPEND ARCH_FLAGS -mlasx)
|
||||
@@ -347,6 +403,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
|
||||
message(STATUS "riscv64 detected")
|
||||
list(APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/arch/riscv/quants.c
|
||||
ggml-cpu/arch/riscv/repack.cpp
|
||||
)
|
||||
if (GGML_RVV)
|
||||
if (GGML_XTHEADVECTOR)
|
||||
list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
|
||||
@@ -358,6 +418,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
|
||||
message(STATUS "s390x detected")
|
||||
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
|
||||
file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
|
||||
string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
|
||||
|
||||
@@ -381,12 +442,16 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (GGML_VXE)
|
||||
list(APPEND ARCH_FLAGS -mvx -mzvector)
|
||||
endif()
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
|
||||
message(STATUS "Wasm detected")
|
||||
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
|
||||
else()
|
||||
message(STATUS "Unknown architecture")
|
||||
message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
|
||||
list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_AARCH64)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
|
||||
if (GGML_CPU_REPACK)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_REPACK)
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_KLEIDIAI)
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-cpu-traits.h"
|
||||
#include "traits.h"
|
||||
|
||||
#if defined(__gnu_linux__)
|
||||
#include <sys/syscall.h>
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#include "mmq.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-cpu-quants.h"
|
||||
#include "quants.h"
|
||||
#include "ggml-quants.h"
|
||||
#include <algorithm>
|
||||
#include <type_traits>
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
#pragma once
|
||||
|
||||
// Solve alias issue for Apple targets (currently PowerPC, x86, and ARM64).
|
||||
// Mach-O has a weak alias equivalent but no practical compiler support can
|
||||
// be found, so we need to do it manually.
|
||||
// ref: https://stackoverflow.com/questions/42757744
|
||||
//
|
||||
// This file is a complement to native implementations in the `arch` folder.
|
||||
// A kernel in quants.c or repack.cpp is either:
|
||||
// - implemented in the `arch` folder, or
|
||||
// - defined in this file to remove the `_generic` suffix
|
||||
|
||||
#if defined(GGML_CPU_GENERIC)
|
||||
// quants.c
|
||||
#define quantize_row_q8_0_generic quantize_row_q8_0
|
||||
#define quantize_row_q8_1_generic quantize_row_q8_1
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_q4_0_q8_0_generic ggml_vec_dot_q4_0_q8_0
|
||||
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
||||
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
|
||||
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
|
||||
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||
#define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
|
||||
#define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
|
||||
#define ggml_vec_dot_q5_K_q8_K_generic ggml_vec_dot_q5_K_q8_K
|
||||
#define ggml_vec_dot_q6_K_q8_K_generic ggml_vec_dot_q6_K_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#elif defined(__aarch64__) || defined(__arm__)
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#elif defined(__x86_64__) || defined(__i386__)
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#elif defined(__POWERPC__)
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#endif
|
||||
@@ -0,0 +1,94 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#if defined(__aarch64__)
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <sys/auxv.h>
|
||||
#elif defined(__APPLE__)
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
|
||||
#if !defined(HWCAP2_I8MM)
|
||||
#define HWCAP2_I8MM (1 << 13)
|
||||
#endif
|
||||
|
||||
#if !defined(HWCAP2_SME)
|
||||
#define HWCAP2_SME (1 << 23)
|
||||
#endif
|
||||
|
||||
struct aarch64_features {
|
||||
// has_neon not needed, aarch64 has NEON guaranteed
|
||||
bool has_dotprod = false;
|
||||
bool has_fp16_va = false;
|
||||
bool has_sve = false;
|
||||
bool has_sve2 = false;
|
||||
bool has_i8mm = false;
|
||||
bool has_sme = false;
|
||||
|
||||
aarch64_features() {
|
||||
#if defined(__linux__)
|
||||
uint32_t hwcap = getauxval(AT_HWCAP);
|
||||
uint32_t hwcap2 = getauxval(AT_HWCAP2);
|
||||
|
||||
has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
|
||||
has_fp16_va = !!(hwcap & HWCAP_FPHP);
|
||||
has_sve = !!(hwcap & HWCAP_SVE);
|
||||
has_sve2 = !!(hwcap2 & HWCAP2_SVE2);
|
||||
has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
|
||||
has_sme = !!(hwcap2 & HWCAP2_SME);
|
||||
#elif defined(__APPLE__)
|
||||
int oldp = 0;
|
||||
size_t size = sizeof(oldp);
|
||||
|
||||
if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
|
||||
has_dotprod = static_cast<bool>(oldp);
|
||||
}
|
||||
|
||||
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
|
||||
has_i8mm = static_cast<bool>(oldp);
|
||||
}
|
||||
|
||||
if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
|
||||
has_sme = static_cast<bool>(oldp);
|
||||
}
|
||||
|
||||
// Apple apparently does not implement SVE yet
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
static int ggml_backend_cpu_aarch64_score() {
|
||||
int score = 1;
|
||||
aarch64_features af;
|
||||
|
||||
#ifdef GGML_USE_DOTPROD
|
||||
if (!af.has_dotprod) { return 0; }
|
||||
score += 1<<1;
|
||||
#endif
|
||||
#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC
|
||||
if (!af.has_fp16_va) { return 0; }
|
||||
score += 1<<2;
|
||||
#endif
|
||||
#ifdef GGML_USE_SVE
|
||||
if (!af.has_sve) { return 0; }
|
||||
score += 1<<3;
|
||||
#endif
|
||||
#ifdef GGML_USE_MATMUL_INT8
|
||||
if (!af.has_i8mm) { return 0; }
|
||||
score += 1<<4;
|
||||
#endif
|
||||
#ifdef GGML_USE_SVE2
|
||||
if (!af.has_sve2) { return 0; }
|
||||
score += 1<<5;
|
||||
#endif
|
||||
#ifdef GGML_USE_SME
|
||||
if (!af.has_sme) { return 0; }
|
||||
score += 1<<6;
|
||||
#endif
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score)
|
||||
|
||||
# endif // defined(__aarch64__)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,396 @@
|
||||
#define GGML_COMMON_IMPL_CPP
|
||||
#define GGML_COMMON_DECL_CPP
|
||||
#include "ggml-common.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "traits.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#include <cstdlib> // for qsort
|
||||
#include <cstdio> // for GGML_ASSERT
|
||||
|
||||
#define GGML_CPU_CLANG_WORKAROUND
|
||||
#include "../../repack.h"
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Woverlength-strings"
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 8;
|
||||
const int blocklen = 8;
|
||||
|
||||
assert (n % qk == 0);
|
||||
assert (nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(s);
|
||||
UNUSED(bs);
|
||||
UNUSED(vx);
|
||||
UNUSED(vy);
|
||||
UNUSED(nr);
|
||||
UNUSED(nc);
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined __riscv_v
|
||||
if (__riscv_vlenb() >= QK4_0) {
|
||||
const size_t vl = QK4_0;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
|
||||
vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
|
||||
for (int l = 0; l < nb; l++) {
|
||||
const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0];
|
||||
const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8];
|
||||
const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16];
|
||||
const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24];
|
||||
__asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment constraints
|
||||
const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4));
|
||||
const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4));
|
||||
const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4));
|
||||
const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4));
|
||||
|
||||
const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
|
||||
const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
|
||||
const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
|
||||
const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
|
||||
const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
|
||||
const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
|
||||
const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
|
||||
|
||||
const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
|
||||
const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
|
||||
|
||||
const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m));
|
||||
const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
|
||||
const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
|
||||
const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
|
||||
const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
|
||||
const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
|
||||
const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
|
||||
const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
|
||||
const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
|
||||
const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
|
||||
const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
|
||||
const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
|
||||
const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
|
||||
|
||||
// vector version needs Zvfhmin extension
|
||||
const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d);
|
||||
const float b_scales[8] = {
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[0]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[1]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[2]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[3]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[4]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[5]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[6]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[7])
|
||||
};
|
||||
const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
|
||||
const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
|
||||
sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4);
|
||||
}
|
||||
__riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
||||
{
|
||||
float sumf[8];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
||||
}
|
||||
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 8;
|
||||
const int blocklen = 8;
|
||||
|
||||
assert (n % qk == 0);
|
||||
assert (nr % 4 == 0);
|
||||
assert (nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(s);
|
||||
UNUSED(bs);
|
||||
UNUSED(vx);
|
||||
UNUSED(vy);
|
||||
UNUSED(nr);
|
||||
UNUSED(nc);
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined __riscv_v
|
||||
if (__riscv_vlenb() >= QK4_0) {
|
||||
const size_t vl = QK4_0;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
|
||||
vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
|
||||
vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
|
||||
vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
|
||||
for (int l = 0; l < nb; l++) {
|
||||
const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
|
||||
const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
|
||||
const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
|
||||
const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
|
||||
const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
|
||||
const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
|
||||
const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
|
||||
|
||||
// vector version needs Zvfhmin extension
|
||||
const float a_scales[4] = {
|
||||
GGML_FP16_TO_FP32(a_ptr[l].d[0]),
|
||||
GGML_FP16_TO_FP32(a_ptr[l].d[1]),
|
||||
GGML_FP16_TO_FP32(a_ptr[l].d[2]),
|
||||
GGML_FP16_TO_FP32(a_ptr[l].d[3])
|
||||
};
|
||||
const float b_scales[8] = {
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[0]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[1]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[2]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[3]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[4]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[5]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[6]),
|
||||
GGML_FP16_TO_FP32(b_ptr[l].d[7])
|
||||
};
|
||||
const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
|
||||
|
||||
const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0];
|
||||
const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32];
|
||||
const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64];
|
||||
const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96];
|
||||
__asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
|
||||
vint16m4_t sumi_l0;
|
||||
{
|
||||
const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4));
|
||||
const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4));
|
||||
const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4));
|
||||
const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4));
|
||||
const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
|
||||
const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
|
||||
|
||||
sumi_l0 = sumi_hi_m;
|
||||
}
|
||||
|
||||
{
|
||||
const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0));
|
||||
const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
|
||||
const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
|
||||
const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
|
||||
const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
|
||||
const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
|
||||
const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
|
||||
const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
|
||||
const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
|
||||
const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
|
||||
const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
|
||||
const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
|
||||
const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
|
||||
|
||||
const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4);
|
||||
sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4);
|
||||
}
|
||||
|
||||
const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8];
|
||||
const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40];
|
||||
const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72];
|
||||
const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104];
|
||||
__asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
|
||||
vint16m4_t sumi_l1;
|
||||
{
|
||||
const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4));
|
||||
const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4));
|
||||
const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4));
|
||||
const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4));
|
||||
const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
|
||||
const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
|
||||
|
||||
sumi_l1 = sumi_hi_m;
|
||||
}
|
||||
|
||||
{
|
||||
const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1));
|
||||
const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
|
||||
const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
|
||||
const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
|
||||
const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
|
||||
const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
|
||||
const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
|
||||
const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
|
||||
const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
|
||||
const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
|
||||
const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
|
||||
const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
|
||||
const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
|
||||
|
||||
const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4);
|
||||
sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4);
|
||||
}
|
||||
|
||||
const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16];
|
||||
const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48];
|
||||
const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80];
|
||||
const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112];
|
||||
__asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
|
||||
vint16m4_t sumi_l2;
|
||||
{
|
||||
const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4));
|
||||
const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4));
|
||||
const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4));
|
||||
const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4));
|
||||
const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
|
||||
const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
|
||||
|
||||
sumi_l2 = sumi_hi_m;
|
||||
}
|
||||
|
||||
{
|
||||
const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2));
|
||||
const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
|
||||
const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
|
||||
const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
|
||||
const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
|
||||
const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
|
||||
const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
|
||||
const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
|
||||
const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
|
||||
const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
|
||||
const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
|
||||
const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
|
||||
const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
|
||||
|
||||
const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4);
|
||||
sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4);
|
||||
}
|
||||
|
||||
const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24];
|
||||
const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56];
|
||||
const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88];
|
||||
const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120];
|
||||
__asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
|
||||
vint16m4_t sumi_l3;
|
||||
{
|
||||
const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4));
|
||||
const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4));
|
||||
const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4));
|
||||
const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4));
|
||||
const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
|
||||
const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
|
||||
const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
|
||||
|
||||
sumi_l3 = sumi_hi_m;
|
||||
}
|
||||
|
||||
{
|
||||
const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3));
|
||||
const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
|
||||
const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
|
||||
const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
|
||||
const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
|
||||
const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
|
||||
const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
|
||||
const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
|
||||
const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
|
||||
const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
|
||||
const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
|
||||
const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
|
||||
const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
|
||||
|
||||
const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4);
|
||||
sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 4);
|
||||
}
|
||||
}
|
||||
__riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4);
|
||||
__riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4);
|
||||
__riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4);
|
||||
__riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
|
||||
float sumf[4][8];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
||||
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++)
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu-traits.h"
|
||||
#include "traits.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-cpu-traits.h"
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML internal header
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
|
||||
@@ -503,6 +503,34 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
|
||||
// TODO: move to ggml-threading
|
||||
void ggml_barrier(struct ggml_threadpool * tp);
|
||||
|
||||
void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
|
||||
int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#define GGML_DO_PRAGMA_(x) _Pragma (#x)
|
||||
#define GGML_DO_PRAGMA(x) GGML_DO_PRAGMA_(x)
|
||||
#if defined(GGML_CPU_GENERIC) || defined(__HIPCC__) || defined(__APPLE__)
|
||||
// Note for Apple targets:
|
||||
// - clang: aliases are not supported on darwin
|
||||
// - all native kernels need to be implemented in both x86 and arm files
|
||||
// - on iOS, tvOS, and visionOS, if cmake cannot determine the target architecture, all `_generic` names are replaced by defines
|
||||
# define GGML_WEAK_ALIAS(name, alias)
|
||||
#elif defined(__GNUC__)
|
||||
// GCC/Clang on *nix
|
||||
# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(weak name = alias) // NOLINT
|
||||
#elif defined(_MSC_VER) && defined(_WIN64)
|
||||
// MSVC
|
||||
// Note: C name mangling varies across different calling conventions
|
||||
// see https://learn.microsoft.com/en-us/cpp/build/reference/decorated-names?view=msvc-170
|
||||
# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:" #name "=" #alias))
|
||||
#elif defined(_MSC_VER) && defined(WIN32)
|
||||
// ref: https://github.com/ggml-org/whisper.cpp/pull/3239#issuecomment-2958224591
|
||||
# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:_" #name "=_" #alias))
|
||||
#else
|
||||
# error "Unsupported compiler for GGML_WEAK_ALIAS"
|
||||
#endif
|
||||
|
||||
#define GGML_CPU_NATIVE_IMPL(name) GGML_WEAK_ALIAS(name, name ## _generic)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -3,11 +3,11 @@
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-cpu-traits.h"
|
||||
#include "traits.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu-quants.h"
|
||||
#include "quants.h"
|
||||
#include "ggml-threading.h"
|
||||
#include "unary-ops.h"
|
||||
#include "binary-ops.h"
|
||||
@@ -559,6 +559,14 @@ void ggml_barrier(struct ggml_threadpool * tp) {
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
|
||||
atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
|
||||
}
|
||||
|
||||
int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
|
||||
return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
|
||||
}
|
||||
|
||||
#if defined(__gnu_linux__)
|
||||
static cpu_set_t ggml_get_numa_affinity(void) {
|
||||
cpu_set_t cpuset;
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-cpu-aarch64.h"
|
||||
#include "ggml-cpu-traits.h"
|
||||
#include "repack.h"
|
||||
#include "traits.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "amx/amx.h"
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#include <vector>
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
# include "ggml-cpu-hbm.h"
|
||||
# include "hbm.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_KLEIDIAI
|
||||
@@ -51,9 +51,9 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_AARCH64
|
||||
if (ggml_backend_cpu_aarch64_buffer_type()) {
|
||||
bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
|
||||
#ifdef GGML_USE_CPU_REPACK
|
||||
if (ggml_backend_cpu_repack_buffer_type()) {
|
||||
bufts.push_back(ggml_backend_cpu_repack_buffer_type());
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -596,8 +596,8 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
||||
#ifdef GGML_USE_CPU_KLEIDIAI
|
||||
features.push_back({ "KLEIDIAI", "1" });
|
||||
#endif
|
||||
#ifdef GGML_USE_CPU_AARCH64
|
||||
features.push_back({ "AARCH64_REPACK", "1" });
|
||||
#ifdef GGML_USE_CPU_REPACK
|
||||
features.push_back({ "REPACK", "1" });
|
||||
#endif
|
||||
|
||||
features.push_back({ nullptr, nullptr });
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include "ggml-cpu-hbm.h"
|
||||
#include "hbm.h"
|
||||
|
||||
// buffer type HBM
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-threading.h"
|
||||
#include "ggml-cpu-traits.h"
|
||||
#include "traits.h"
|
||||
|
||||
#include "kernels.h"
|
||||
|
||||
|
||||
@@ -53,7 +53,6 @@
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-quants.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <array>
|
||||
#include <type_traits>
|
||||
|
||||
@@ -394,8 +393,6 @@ class tinyBLAS {
|
||||
|
||||
template <int RM, int RN, int BM>
|
||||
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
|
||||
static std::atomic<int64_t> current_chunk;
|
||||
|
||||
GGML_ASSERT(m % (RM * BM) == 0);
|
||||
const int64_t ytiles = m / (RM * BM);
|
||||
const int64_t xtiles = (n + RN -1) / RN;
|
||||
@@ -410,7 +407,7 @@ class tinyBLAS {
|
||||
if (params->ith == 0) {
|
||||
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
|
||||
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
|
||||
std::atomic_store_explicit(¤t_chunk, (int64_t)params->nth, std::memory_order_relaxed);
|
||||
ggml_threadpool_chunk_set(params->threadpool, params->nth);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
@@ -439,8 +436,7 @@ class tinyBLAS {
|
||||
GGML_ASSERT(jj == jj2);
|
||||
}
|
||||
|
||||
// next step.
|
||||
job = std::atomic_fetch_add_explicit(¤t_chunk, (int64_t)1, std::memory_order_relaxed);
|
||||
job = ggml_threadpool_chunk_add(params->threadpool, 1);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -58,6 +58,32 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
// Generic implementation
|
||||
void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,103 @@
|
||||
#pragma once
|
||||
|
||||
#define GGML_COMMON_DECL_CPP
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include "traits.h"
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML internal header
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void);
|
||||
|
||||
template <int K> constexpr int QK_0() {
|
||||
if constexpr (K == 4) {
|
||||
return QK4_0;
|
||||
}
|
||||
if constexpr (K == 8) {
|
||||
return QK8_0;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
template <int K, int N> struct block {
|
||||
ggml_half d[N]; // deltas for N qK_0 blocks
|
||||
int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks
|
||||
};
|
||||
|
||||
// control size
|
||||
static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
|
||||
static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
|
||||
static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
|
||||
static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
|
||||
|
||||
using block_q4_0x4 = block<4, 4>;
|
||||
using block_q4_0x8 = block<4, 8>;
|
||||
using block_q8_0x4 = block<8, 4>;
|
||||
using block_q8_0x8 = block<8, 8>;
|
||||
|
||||
struct block_q4_Kx8 {
|
||||
ggml_half d[8]; // super-block scale for quantized scales
|
||||
ggml_half dmin[8]; // super-block scale for quantized mins
|
||||
uint8_t scales[96]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qs[1024]; // 4--bit quants
|
||||
};
|
||||
|
||||
static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
|
||||
|
||||
struct block_q8_Kx4 {
|
||||
float d[4]; // delta
|
||||
int8_t qs[QK_K * 4]; // quants
|
||||
int16_t bsums[QK_K / 4]; // sum of quants in groups of 16
|
||||
};
|
||||
|
||||
static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding");
|
||||
|
||||
struct block_iq4_nlx4 {
|
||||
ggml_half d[4]; // deltas for 4 iq4_nl blocks
|
||||
uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks
|
||||
};
|
||||
|
||||
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Workaround for clang:
|
||||
// clang++ complains: ``error: call to 'ggml_gemm_q4_0_4x4_q8_0' is ambiguous''
|
||||
// repro: https://godbolt.org/z/oKdeWKonM (ICE), https://godbolt.org/z/1szq6P36v (ambiguous call)
|
||||
#if defined(GGML_CPU_CLANG_WORKAROUND) || defined(__APPLE__) || !(defined(__GNUC__) && defined(__clang__)) || defined(__HIPCC__)
|
||||
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
#endif // !defined(__clang__)
|
||||
|
||||
// Native implementations
|
||||
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
} // extern "C"
|
||||
#endif
|
||||
@@ -1,4 +1,4 @@
|
||||
#include "ggml-cpu-traits.h"
|
||||
#include "traits.h"
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
@@ -207,9 +207,9 @@ typedef float2 dfloat2;
|
||||
#define FP16_MMA_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
|
||||
#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
|
||||
#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
|
||||
#define FP16_MMA_AVAILABLE
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
#define NEW_MMA_AVAILABLE
|
||||
@@ -262,11 +262,11 @@ static bool cp_async_available(const int cc) {
|
||||
}
|
||||
|
||||
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
return __AMDGCN_WAVEFRONT_SIZE;
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
return 64;
|
||||
#else
|
||||
return 32;
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
}
|
||||
|
||||
[[noreturn]]
|
||||
@@ -466,9 +466,6 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
||||
// TODO: move to ggml-common.h
|
||||
static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
|
||||
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
|
||||
|
||||
static __device__ __forceinline__ float get_alibi_slope(
|
||||
|
||||
@@ -10,6 +10,8 @@ __global__ void __launch_bounds__(splitD, 2)
|
||||
float * __restrict__ dst, const int64_t L) {
|
||||
GGML_UNUSED(src1_nb0);
|
||||
GGML_UNUSED(src2_nb0);
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
const int bidx = blockIdx.x; // split along B
|
||||
const int bidy = blockIdx.y; // split along D
|
||||
const int tid = threadIdx.x;
|
||||
@@ -44,16 +46,16 @@ __global__ void __launch_bounds__(splitD, 2)
|
||||
if (N == 16) {
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < splitD / 4; i += 2) {
|
||||
float value = A_block[(wid * warpSize + i) * stride_A + wtid];
|
||||
float value = A_block[(wid * warp_size + i) * stride_A + wtid];
|
||||
// todo: bank conflict
|
||||
// I am always confused with how to use the swizzling method to solve
|
||||
// bank conflit. Hoping somebody can tell me.
|
||||
smem_A[(wid * warpSize + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
smem_A[(wid * warp_size + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
}
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < splitD / 4; i += 2) {
|
||||
float value = s0_block[(wid * warpSize + i) * stride_s0 + wtid];
|
||||
smem_s0[(wid * warpSize + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
float value = s0_block[(wid * warp_size + i) * stride_s0 + wtid];
|
||||
smem_s0[(wid * warp_size + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -113,6 +113,10 @@ if (GGML_HIP_ROCWMMA_FATTN)
|
||||
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN)
|
||||
endif()
|
||||
|
||||
if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0)
|
||||
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
|
||||
endif()
|
||||
|
||||
if (NOT GGML_CUDA_FA)
|
||||
add_compile_definitions(GGML_CUDA_NO_FA)
|
||||
endif()
|
||||
|
||||
@@ -44,21 +44,22 @@ if (GGML_METAL_EMBED_LIBRARY)
|
||||
set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${METALLIB_EMBED_ASM}
|
||||
OUTPUT "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo "Embedding Metal library"
|
||||
COMMAND sed -e '/__embed_ggml-common.h__/r ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED_TMP}
|
||||
COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}' -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED}
|
||||
COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM}
|
||||
COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM}
|
||||
COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM}
|
||||
COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
|
||||
COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM}
|
||||
COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM}
|
||||
COMMAND sed -e "/__embed_ggml-common.h__/r ${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "${METALLIB_SOURCE}" > "${METALLIB_SOURCE_EMBED_TMP}"
|
||||
COMMAND sed -e "/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}" -e "/\#include \"ggml-metal-impl.h\"/d" < "${METALLIB_SOURCE_EMBED_TMP}" > "${METALLIB_SOURCE_EMBED}"
|
||||
COMMAND echo ".section __DATA,__ggml_metallib" > "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo ".globl _ggml_metallib_start" >> "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo "_ggml_metallib_start:" >> "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo .incbin "\"${METALLIB_SOURCE_EMBED}\"" >> "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo ".globl _ggml_metallib_end" >> "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo "_ggml_metallib_end:" >> "${METALLIB_EMBED_ASM}"
|
||||
DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h
|
||||
COMMENT "Generate assembly for embedded Metal library"
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM})
|
||||
target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}")
|
||||
else()
|
||||
if (GGML_METAL_SHADER_DEBUG)
|
||||
# custom command to do the following:
|
||||
|
||||
@@ -3333,8 +3333,6 @@ kernel void kernel_flash_attn_ext(
|
||||
|
||||
threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*DK); // holds the query data
|
||||
threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t
|
||||
threadgroup o_t * so = (threadgroup o_t *) (shmem_f16 + 0*DK); // reuse query data for accumulation
|
||||
threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 0*DK); // same as above but in o4_t
|
||||
threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + 2*sgitg*SH + 2*Q*DK); // scratch buffer for attention, mask and diagonal matrix
|
||||
|
||||
threadgroup k_t * sk = (threadgroup k_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory
|
||||
@@ -3548,20 +3546,20 @@ kernel void kernel_flash_attn_ext(
|
||||
|
||||
// O = diag(ms)*O
|
||||
{
|
||||
s8x8_t mm;
|
||||
simdgroup_load(mm, ss + 2*C, TS, 0, false);
|
||||
s8x8_t ms;
|
||||
simdgroup_load(ms, ss + 2*C, TS, 0, false);
|
||||
|
||||
#pragma unroll(DV8)
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
simdgroup_multiply(lo[i], mm, lo[i]);
|
||||
simdgroup_multiply(lo[i], ms, lo[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// O = O + (Q*K^T)*V
|
||||
{
|
||||
for (short cc = 0; cc < C/8; ++cc) {
|
||||
s8x8_t ms;
|
||||
simdgroup_load(ms, ss + 8*cc, TS, 0, false);
|
||||
s8x8_t vs;
|
||||
simdgroup_load(vs, ss + 8*cc, TS, 0, false);
|
||||
|
||||
if (is_same<vd4x4_t, v4x4_t>::value) {
|
||||
// we can read directly from global memory
|
||||
@@ -3572,7 +3570,7 @@ kernel void kernel_flash_attn_ext(
|
||||
v8x8_t mv;
|
||||
simdgroup_load(mv, pv + i*8, args.nb21/sizeof(v_t), 0, false); // TODO: use ne20
|
||||
|
||||
simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]);
|
||||
simdgroup_multiply_accumulate(lo[i], vs, mv, lo[i]);
|
||||
}
|
||||
} else {
|
||||
for (short ii = 0; ii < DV16; ii += 4) {
|
||||
@@ -3593,10 +3591,10 @@ kernel void kernel_flash_attn_ext(
|
||||
v8x8_t mv;
|
||||
|
||||
simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]);
|
||||
|
||||
simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]);
|
||||
}
|
||||
} else {
|
||||
if (ii + tx < DV16) {
|
||||
@@ -3611,10 +3609,10 @@ kernel void kernel_flash_attn_ext(
|
||||
v8x8_t mv;
|
||||
|
||||
simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]);
|
||||
|
||||
simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3624,83 +3622,80 @@ kernel void kernel_flash_attn_ext(
|
||||
}
|
||||
|
||||
// these are needed for reducing the results from the simdgroups (reuse the ss buffer)
|
||||
for (short j = 0; j < Q; ++j) {
|
||||
if (tiisg == 0) {
|
||||
ss[j*TS + 0] = S[j];
|
||||
ss[j*TS + 1] = M[j];
|
||||
}
|
||||
for (short j = tiisg; j < Q; j += NW) {
|
||||
ss[j*TS + 0] = S[j];
|
||||
ss[j*TS + 1] = M[j];
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
threadgroup float * so = (threadgroup float *) (shmem_f16 + 0*DK); // reuse query data for accumulation
|
||||
threadgroup float4 * so4 = (threadgroup float4 *) (shmem_f16 + 0*DK);
|
||||
|
||||
// store result to shared memory in F32
|
||||
if (sgitg == 0) {
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
//simdgroup_store(lo[i], so + i*8, DV, 0, false);
|
||||
simdgroup_float8x8 t(1.0f);
|
||||
simdgroup_multiply(t, lo[i], t);
|
||||
simdgroup_store(t, so + i*8, DV, 0, false);
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// reduce the warps sequentially
|
||||
for (ushort sg = 1; sg < nsg; ++sg) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// each simdgroup stores its output to shared memory, reusing sq
|
||||
if (sgitg == sg) {
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
simdgroup_store(lo[i], so + i*8, DV, 0, false);
|
||||
}
|
||||
}
|
||||
for (short j = tiisg; j < Q; j += NW) {
|
||||
const float S0 = ss[j*TS - 1*SH + 0];
|
||||
const float S1 = ss[j*TS + 0];
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// the first simdgroup accumulates the results from the other simdgroups
|
||||
if (sgitg == 0) {
|
||||
for (short j = 0; j < Q; ++j) {
|
||||
const float S0 = ss[j*TS + 0];
|
||||
const float S1 = ss[j*TS + sg*SH + 0];
|
||||
|
||||
const float M0 = ss[j*TS + 1];
|
||||
const float M1 = ss[j*TS + sg*SH + 1];
|
||||
const float M0 = ss[j*TS - 1*SH + 1];
|
||||
const float M1 = ss[j*TS + 1];
|
||||
|
||||
const float M = max(M0, M1);
|
||||
|
||||
const float ms0 = exp(M0 - M);
|
||||
const float ms1 = exp(M1 - M);
|
||||
float ms0 = exp(M0 - M);
|
||||
float ms1 = exp(M1 - M);
|
||||
|
||||
const float S = S0*ms0 + S1*ms1;
|
||||
|
||||
if (tiisg == 0) {
|
||||
ss[j*TS + 0] = S;
|
||||
ss[j*TS + 1] = M;
|
||||
ss[j*TS + 0] = S;
|
||||
ss[j*TS + 1] = M;
|
||||
|
||||
ss[j*TS + 2*C + j ] = ms0;
|
||||
ss[j*TS + 2*C + j + sg*SH] = ms1;
|
||||
}
|
||||
ss[j*TS + 2*C + j - 1*SH] = ms0;
|
||||
ss[j*TS + 2*C + j ] = ms1;
|
||||
}
|
||||
|
||||
//simdgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
|
||||
{
|
||||
s8x8_t ms0;
|
||||
s8x8_t ms1;
|
||||
|
||||
simdgroup_load(ms0, ss + 2*C, TS, 0, false);
|
||||
simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false);
|
||||
simdgroup_load(ms0, ss + 2*C - 1*SH, TS, 0, false);
|
||||
simdgroup_load(ms1, ss + 2*C, TS, 0, false);
|
||||
|
||||
#pragma unroll(DV8)
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
o8x8_t t;
|
||||
simdgroup_float8x8 t;
|
||||
|
||||
simdgroup_load (t, so + i*8, DV, 0, false);
|
||||
simdgroup_multiply(t, ms1, t);
|
||||
simdgroup_multiply(t, ms0, t);
|
||||
|
||||
simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t);
|
||||
simdgroup_multiply_accumulate(t, ms1, lo[i], t);
|
||||
simdgroup_store(t, so + i*8, DV, 0, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
// store result to shared memory (reuse sq)
|
||||
if (sgitg == 0) {
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
simdgroup_store(lo[i], so + i*8, DV, 0, false);
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*Q*DK);
|
||||
threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*(nsg-1)*SH + 2*Q*DK);
|
||||
|
||||
// final rescale with 1/S and store to global memory
|
||||
for (short j = sgitg; j < Q && iq1 + j < args.ne01; j += nsg) {
|
||||
@@ -3723,8 +3718,8 @@ kernel void kernel_flash_attn_ext(
|
||||
half, half4x4, simdgroup_half8x8, \
|
||||
float, simdgroup_float8x8, \
|
||||
float, simdgroup_float8x8, \
|
||||
float, float4, simdgroup_float8x8
|
||||
//half, half4, simdgroup_half8x8
|
||||
half, half4, simdgroup_half8x8
|
||||
//float, float4, simdgroup_float8x8
|
||||
|
||||
#define FA_TYPES_BF \
|
||||
bfloat, bfloat4, simdgroup_bfloat8x8, \
|
||||
@@ -3732,8 +3727,8 @@ kernel void kernel_flash_attn_ext(
|
||||
bfloat, bfloat4x4, simdgroup_bfloat8x8, \
|
||||
float, simdgroup_float8x8, \
|
||||
float, simdgroup_float8x8, \
|
||||
float, float4, simdgroup_float8x8
|
||||
//half, half4, simdgroup_half8x8
|
||||
half, half4, simdgroup_half8x8
|
||||
//float, float4, simdgroup_float8x8
|
||||
|
||||
typedef decltype(kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64, 64>) flash_attn_ext_t;
|
||||
|
||||
|
||||
@@ -80,6 +80,7 @@ set(GGML_OPENCL_KERNELS
|
||||
mul_mv_q4_0_f32_1d_8x_flat
|
||||
mul_mv_q4_0_f32_1d_16x_flat
|
||||
mul_mv_q6_k
|
||||
mul_mv_id_q4_0_f32_8x_flat
|
||||
mul
|
||||
norm
|
||||
relu
|
||||
|
||||
@@ -321,6 +321,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_program program_upscale;
|
||||
cl_program program_concat;
|
||||
cl_program program_tsembd;
|
||||
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
|
||||
|
||||
cl_kernel kernel_add, kernel_add_row;
|
||||
cl_kernel kernel_mul, kernel_mul_row;
|
||||
@@ -366,6 +367,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_concat_f32_contiguous;
|
||||
cl_kernel kernel_concat_f32_non_contiguous;
|
||||
cl_kernel kernel_timestep_embedding;
|
||||
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// Transpose kernels
|
||||
@@ -1112,7 +1114,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// repeat
|
||||
// repeat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
@@ -1256,6 +1258,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
}
|
||||
}
|
||||
|
||||
// mul_mv_id_q4_0_f32_8x_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
|
||||
#endif
|
||||
backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// Adreno kernels
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// transpose
|
||||
@@ -2178,6 +2196,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
||||
}
|
||||
return false;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
if (op->src[0]->type == GGML_TYPE_Q4_0) {
|
||||
if (op->src[1]->type == GGML_TYPE_F32) {
|
||||
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
@@ -5536,6 +5561,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const ggml_tensor * src2 = dst->src[2];
|
||||
GGML_ASSERT(src2);
|
||||
GGML_ASSERT(src2->extra);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offset2 = extra2->offset + src2->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
|
||||
#endif
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_ulong nb00 = src0->nb[0];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3];
|
||||
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
|
||||
const int ne20 = src2->ne[0];
|
||||
const int ne21 = src2->ne[1];
|
||||
|
||||
const cl_ulong nb21 = src2->nb[1];
|
||||
|
||||
const int ne0 = dst->ne[0];
|
||||
const int ne1 = dst->ne[1];
|
||||
|
||||
const int r2 = ne12/ne02;
|
||||
const int r3 = ne13/ne03;
|
||||
const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
|
||||
|
||||
GGML_ASSERT(ne00 == ne10);
|
||||
|
||||
int sgs = 32; // subgroup size
|
||||
int nsg = 1; // number of subgroups
|
||||
int nrows = 1; // number of row in src1
|
||||
int ndst = 4; // number of values produced by each subgroup
|
||||
|
||||
cl_kernel kernel;
|
||||
|
||||
// subgroup mat vec
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0: {
|
||||
kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
sgs = 16;
|
||||
nsg = 1;
|
||||
ndst = 8;
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
sgs = 64;
|
||||
nsg = 1;
|
||||
ndst = 8;
|
||||
} else {
|
||||
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3));
|
||||
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ASSERT(false && "not implemented");;
|
||||
}
|
||||
|
||||
int _ne1 = 1;
|
||||
int ne123 = dst_rows;
|
||||
|
||||
size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
|
||||
size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -6444,6 +6599,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_mul_mat;
|
||||
break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_mul_mat_id;
|
||||
break;
|
||||
case GGML_OP_SCALE:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
|
||||
@@ -0,0 +1,283 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define QK4_0 32
|
||||
|
||||
typedef char int8_t;
|
||||
typedef uchar uint8_t;
|
||||
typedef short int16_t;
|
||||
typedef ushort uint16_t;
|
||||
typedef int int32_t;
|
||||
typedef uint uint32_t;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q4_0
|
||||
//------------------------------------------------------------------------------
|
||||
struct block_q4_0
|
||||
{
|
||||
half d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
|
||||
// This function requires the original shuffled weights.
|
||||
// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
|
||||
// packed together in a byte, so are (q[1], q[17]) and so on.
|
||||
inline float block_q_4_0_dot_y_flat(
|
||||
global uchar * x,
|
||||
global half * dh,
|
||||
float sumy,
|
||||
float16 yl,
|
||||
int il
|
||||
) {
|
||||
float d = *dh;
|
||||
global ushort * qs = ((global ushort *)x + il/2);
|
||||
float acc = 0.f;
|
||||
|
||||
acc += yl.s0 * (qs[0] & 0x000F);
|
||||
acc += yl.s1 * (qs[0] & 0x0F00);
|
||||
acc += yl.s8 * (qs[0] & 0x00F0);
|
||||
acc += yl.s9 * (qs[0] & 0xF000);
|
||||
|
||||
acc += yl.s2 * (qs[1] & 0x000F);
|
||||
acc += yl.s3 * (qs[1] & 0x0F00);
|
||||
acc += yl.sa * (qs[1] & 0x00F0);
|
||||
acc += yl.sb * (qs[1] & 0xF000);
|
||||
|
||||
acc += yl.s4 * (qs[2] & 0x000F);
|
||||
acc += yl.s5 * (qs[2] & 0x0F00);
|
||||
acc += yl.sc * (qs[2] & 0x00F0);
|
||||
acc += yl.sd * (qs[2] & 0xF000);
|
||||
|
||||
acc += yl.s6 * (qs[3] & 0x000F);
|
||||
acc += yl.s7 * (qs[3] & 0x0F00);
|
||||
acc += yl.se * (qs[3] & 0x00F0);
|
||||
acc += yl.sf * (qs[3] & 0xF000);
|
||||
|
||||
return d * (sumy * -8.f + acc);
|
||||
}
|
||||
|
||||
//
|
||||
// This variant outputs 8 values.
|
||||
//
|
||||
#undef N_DST
|
||||
#undef N_SIMDGROUP
|
||||
#undef N_SIMDWIDTH
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
#define N_DST 8 // each SIMD group works on 8 rows
|
||||
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 16 // subgroup size
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_DST 8
|
||||
#define N_SIMDGROUP 1
|
||||
#define N_SIMDWIDTH 64
|
||||
#endif
|
||||
|
||||
inline void mul_vec_q_n_f32_8x_flat(
|
||||
global char * src0_q,
|
||||
global half * src0_d,
|
||||
global float * src1,
|
||||
global float * dst,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
const ulong nb = ne00/QK4_0;
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int r1 = get_group_id(1);
|
||||
int im = 0;
|
||||
|
||||
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
// The number of scales is the same as the number of blocks.
|
||||
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
|
||||
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
|
||||
|
||||
global uchar * x = (global uchar *) src0_q + offset0_q;
|
||||
global half * d = (global half *) src0_d + offset0_d;
|
||||
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float16 yl;
|
||||
float8 sumf = 0.f;
|
||||
|
||||
int ix = get_sub_group_local_id()/2;
|
||||
int il = 8*(get_sub_group_local_id()%2);
|
||||
|
||||
global float * yb = y + ix*QK4_0 + il;
|
||||
|
||||
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
|
||||
float sumy = 0.f;
|
||||
|
||||
sumy += yb[0];
|
||||
sumy += yb[1];
|
||||
sumy += yb[2];
|
||||
sumy += yb[3];
|
||||
sumy += yb[4];
|
||||
sumy += yb[5];
|
||||
sumy += yb[6];
|
||||
sumy += yb[7];
|
||||
|
||||
sumy += yb[16];
|
||||
sumy += yb[17];
|
||||
sumy += yb[18];
|
||||
sumy += yb[19];
|
||||
sumy += yb[20];
|
||||
sumy += yb[21];
|
||||
sumy += yb[22];
|
||||
sumy += yb[23];
|
||||
|
||||
yl.s0 = yb[0];
|
||||
yl.s1 = yb[1]/256.f;
|
||||
|
||||
yl.s2 = yb[2];
|
||||
yl.s3 = yb[3]/256.f;
|
||||
|
||||
yl.s4 = yb[4];
|
||||
yl.s5 = yb[5]/256.f;
|
||||
|
||||
yl.s6 = yb[6];
|
||||
yl.s7 = yb[7]/256.f;
|
||||
|
||||
yl.s8 = yb[16]/16.f;
|
||||
yl.s9 = yb[17]/4096.f;
|
||||
|
||||
yl.sa = yb[18]/16.f;
|
||||
yl.sb = yb[19]/4096.f;
|
||||
|
||||
yl.sc = yb[20]/16.f;
|
||||
yl.sd = yb[21]/4096.f;
|
||||
|
||||
yl.se = yb[22]/16.f;
|
||||
yl.sf = yb[23]/4096.f;
|
||||
|
||||
sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
|
||||
sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
|
||||
sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
|
||||
sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
|
||||
|
||||
sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
|
||||
sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
|
||||
sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
|
||||
sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
|
||||
|
||||
yb += QK4_0 * (N_SIMDWIDTH/2);
|
||||
}
|
||||
|
||||
float8 tot = (float8)(
|
||||
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
|
||||
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
|
||||
sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
|
||||
sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
|
||||
);
|
||||
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
if (first_row + 0 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
|
||||
}
|
||||
if (first_row + 1 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
|
||||
}
|
||||
if (first_row + 2 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
|
||||
}
|
||||
if (first_row + 3 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
|
||||
}
|
||||
|
||||
if (first_row + 4 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
|
||||
}
|
||||
if (first_row + 5 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
|
||||
}
|
||||
if (first_row + 6 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
|
||||
}
|
||||
if (first_row + 7 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_16
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
|
||||
global char * src0_q,
|
||||
global half * src0_d,
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global char * src2,
|
||||
ulong offset2,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
ulong nb00,
|
||||
ulong nb02,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int ne20,
|
||||
int ne21,
|
||||
ulong nb21,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src1 = (global float *)((global char *)src1 + offset1);
|
||||
src2 = (global char *)((global char *)src2 + offset2);
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
|
||||
const int iid1 = get_group_id(2)/ne20;
|
||||
const int idx = get_group_id(2)%ne20;
|
||||
|
||||
const int i02 = ((global int *)(src2 + iid1*nb21))[idx];
|
||||
|
||||
const int i11 = idx%ne11;
|
||||
const int i12 = iid1;
|
||||
|
||||
const int i1 = idx;
|
||||
const int i2 = i12;
|
||||
|
||||
global char * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2);
|
||||
global half * src0_d_cur = src0_d + (i02*nb02/nb00);
|
||||
global float * src1_cur = (global float *)((global char *) src1 + i11*nb11 + i12*nb12);
|
||||
global float * dst_cur = dst + i1*ne0 + i2*ne1*ne0;
|
||||
|
||||
mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
|
||||
}
|
||||
@@ -2425,8 +2425,6 @@ void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_REST
|
||||
}
|
||||
}
|
||||
|
||||
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
|
||||
void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||
assert(k % QK4_NL == 0);
|
||||
const int64_t nb = k / QK4_NL;
|
||||
|
||||
@@ -53,6 +53,9 @@ struct socket_t {
|
||||
}
|
||||
};
|
||||
|
||||
// macro for nicer error messages on server crash
|
||||
#define RPC_STATUS_ASSERT(x) if (!(x)) GGML_ABORT("Remote RPC server crashed or returned malformed response")
|
||||
|
||||
// all RPC structures must be packed
|
||||
#pragma pack(push, 1)
|
||||
// ggml_tensor is serialized into rpc_tensor
|
||||
@@ -425,7 +428,7 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
|
||||
static bool check_server_version(const std::shared_ptr<socket_t> & sock) {
|
||||
rpc_msg_hello_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, nullptr, 0, &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
if (response.major != RPC_PROTO_MAJOR_VERSION || response.minor > RPC_PROTO_MINOR_VERSION) {
|
||||
fprintf(stderr, "RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
return false;
|
||||
@@ -481,7 +484,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
rpc_msg_free_buffer_req request = {ctx->remote_ptr};
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@@ -493,7 +496,7 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
rpc_msg_buffer_get_base_req request = {ctx->remote_ptr};
|
||||
rpc_msg_buffer_get_base_rsp response;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
ctx->base_ptr = reinterpret_cast<void *>(response.base_ptr);
|
||||
return ctx->base_ptr;
|
||||
}
|
||||
@@ -545,7 +548,7 @@ static enum ggml_status ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_
|
||||
request.tensor = serialize_tensor(tensor);
|
||||
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
}
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -560,7 +563,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
||||
request.hash = fnv_hash((const uint8_t*)data, size);
|
||||
rpc_msg_set_tensor_hash_rsp response;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, &request, sizeof(request), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
if (response.result) {
|
||||
// the server has the same data, no need to send it
|
||||
return;
|
||||
@@ -573,7 +576,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
||||
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
||||
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size());
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
}
|
||||
|
||||
static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
@@ -583,7 +586,7 @@ static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, con
|
||||
request.offset = offset;
|
||||
request.size = size;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, &request, sizeof(request), data, size);
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
}
|
||||
|
||||
static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
@@ -601,7 +604,7 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
|
||||
request.dst = serialize_tensor(dst);
|
||||
rpc_msg_copy_tensor_rsp response;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
return response.result;
|
||||
}
|
||||
|
||||
@@ -609,7 +612,7 @@ static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, &request, sizeof(request), nullptr, 0);
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
|
||||
@@ -635,7 +638,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
|
||||
rpc_msg_alloc_buffer_rsp response;
|
||||
auto sock = get_socket(buft_ctx->endpoint);
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, &request, sizeof(request), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
if (response.remote_ptr != 0) {
|
||||
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
||||
ggml_backend_rpc_buffer_interface,
|
||||
@@ -650,7 +653,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
|
||||
static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
|
||||
rpc_msg_get_alignment_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, nullptr, 0, &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
return response.alignment;
|
||||
}
|
||||
|
||||
@@ -662,7 +665,7 @@ static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_typ
|
||||
static size_t get_max_size(const std::shared_ptr<socket_t> & sock) {
|
||||
rpc_msg_get_max_size_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, nullptr, 0, &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
return response.max_size;
|
||||
}
|
||||
|
||||
@@ -683,7 +686,7 @@ static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_ty
|
||||
|
||||
rpc_msg_get_alloc_size_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
|
||||
return response.alloc_size;
|
||||
} else {
|
||||
@@ -761,7 +764,7 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
|
||||
rpc_msg_graph_compute_rsp response;
|
||||
auto sock = get_socket(rpc_ctx->endpoint);
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size(), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
return (enum ggml_status)response.result;
|
||||
}
|
||||
|
||||
@@ -835,7 +838,7 @@ bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
||||
static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * free, size_t * total) {
|
||||
rpc_msg_get_device_memory_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, nullptr, 0, &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
*free = response.free_mem;
|
||||
*total = response.total_mem;
|
||||
}
|
||||
|
||||
@@ -142,7 +142,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
ONEMATH
|
||||
GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
|
||||
GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a
|
||||
GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142
|
||||
)
|
||||
FetchContent_MakeAvailable(ONEMATH)
|
||||
# Create alias to match with find_package targets name
|
||||
|
||||
@@ -149,8 +149,6 @@ typedef sycl::float2 dfloat2;
|
||||
|
||||
#define MMVQ_MAX_BATCH_SIZE 8
|
||||
|
||||
static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
|
||||
static int g_all_sycl_device_count = -1;
|
||||
static bool g_ggml_backend_sycl_buffer_type_initialized = false;
|
||||
|
||||
@@ -515,9 +513,9 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
|
||||
|
||||
bool gpu_has_xmx(sycl::device &dev);
|
||||
|
||||
template <int N, class T> void debug_print_array(const std::string & prefix, const T array[N]) {
|
||||
template <int N, class T> std::string debug_get_array_str(const std::string & prefix, const T array[N]) {
|
||||
if (LIKELY(!g_ggml_sycl_debug)) {
|
||||
return;
|
||||
return "";
|
||||
}
|
||||
std::stringstream ss;
|
||||
ss << prefix << "=[";
|
||||
@@ -528,29 +526,26 @@ template <int N, class T> void debug_print_array(const std::string & prefix, con
|
||||
ss << array[N - 1];
|
||||
}
|
||||
ss << "]";
|
||||
GGML_SYCL_DEBUG("%s", ss.str().c_str());
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor,
|
||||
const std::string & suffix = "") {
|
||||
if (LIKELY(!g_ggml_sycl_debug)) {
|
||||
return;
|
||||
}
|
||||
GGML_SYCL_DEBUG("%s=", prefix.c_str());
|
||||
inline std::string debug_get_tensor_str(const std::string &prefix,
|
||||
const ggml_tensor *tensor, const std::string &suffix = "") {
|
||||
std::stringstream ss;
|
||||
if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); }
|
||||
ss << prefix.c_str() << "=";
|
||||
if (tensor) {
|
||||
GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type));
|
||||
debug_print_array<GGML_MAX_DIMS>(";ne", tensor->ne);
|
||||
debug_print_array<GGML_MAX_DIMS>(";nb", tensor->nb);
|
||||
if (!ggml_is_contiguous(tensor)) {
|
||||
GGML_SYCL_DEBUG(";strided");
|
||||
}
|
||||
if (ggml_is_permuted(tensor)) {
|
||||
GGML_SYCL_DEBUG(";permuted");
|
||||
}
|
||||
ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type);
|
||||
ss << debug_get_array_str<GGML_MAX_DIMS>(";ne", tensor->ne);
|
||||
ss << debug_get_array_str<GGML_MAX_DIMS>(";nb", tensor->nb);
|
||||
|
||||
if (!ggml_is_contiguous(tensor)) { ss << ";strided"; }
|
||||
if (ggml_is_permuted(tensor)) { ss << ";permuted"; }
|
||||
} else {
|
||||
GGML_SYCL_DEBUG("nullptr");
|
||||
ss << "nullptr";
|
||||
}
|
||||
GGML_SYCL_DEBUG("%s", suffix.c_str());
|
||||
ss << suffix;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
// Use scope_op_debug_print to log operations coming from running a model
|
||||
@@ -566,10 +561,10 @@ struct scope_op_debug_print {
|
||||
return;
|
||||
}
|
||||
GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
|
||||
debug_print_tensor(" dst", dst);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str());
|
||||
if (dst) {
|
||||
for (std::size_t i = 0; i < num_src; ++i) {
|
||||
debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str());
|
||||
}
|
||||
}
|
||||
GGML_SYCL_DEBUG("%s\n", suffix.data());
|
||||
|
||||
@@ -723,8 +723,7 @@ static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const
|
||||
|
||||
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
|
||||
// Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field
|
||||
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
|
||||
std::string(" src0 type=") + ggml_type_name(src0->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, debug_get_tensor_str("\tsrc0", src0));
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
|
||||
|
||||
@@ -65,6 +65,9 @@ public:
|
||||
|
||||
dnnl::primitive_attr primitive_attr;
|
||||
primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
|
||||
#ifdef GGML_SYCL_F16
|
||||
primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16);
|
||||
#endif
|
||||
|
||||
auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
|
||||
auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
|
||||
|
||||
@@ -347,7 +347,7 @@ static enum ggml_status
|
||||
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor, "\n");
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
|
||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
|
||||
if (tensor->view_src != NULL) {
|
||||
@@ -385,7 +385,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
const void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
ggml_sycl_set_device(ctx->device);
|
||||
@@ -413,7 +413,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
|
||||
@@ -444,8 +444,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *dst) try {
|
||||
bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": dst=", dst);
|
||||
debug_print_tensor(" src=", src);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
|
||||
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
||||
if (is_cpy_supported) {
|
||||
ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
|
||||
@@ -525,7 +525,7 @@ catch (sycl::exception const &exc) {
|
||||
static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
|
||||
size_t offset, size_t size) {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
|
||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx->device));
|
||||
@@ -805,7 +805,7 @@ static enum ggml_status
|
||||
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor, "\n");
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
|
||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
|
||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||
@@ -891,7 +891,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor, const void *data,
|
||||
size_t offset, size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
@@ -947,7 +947,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
const ggml_tensor *tensor, void *data,
|
||||
size_t offset, size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
@@ -2127,21 +2127,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
|
||||
? (const sycl::half *)src1->data + src1_padded_row_size
|
||||
: src1_as_f16.get();
|
||||
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
|
||||
|
||||
#if GGML_SYCL_DNNL
|
||||
if (!g_ggml_sycl_disable_dnn) {
|
||||
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
|
||||
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
|
||||
dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
||||
" : converting dst to fp32");
|
||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
|
||||
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
|
||||
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
|
||||
|
||||
const sycl::half alpha_f16 = 1.0f;
|
||||
const sycl::half beta_f16 = 0.0f;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
|
||||
@@ -3866,7 +3863,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
||||
const void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
@@ -3887,7 +3884,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
||||
void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
@@ -3910,8 +3907,8 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
|
||||
bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
|
||||
ggml_backend_buffer_is_sycl(src->buffer);
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": dst=", dst);
|
||||
debug_print_tensor(" src=", src);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
|
||||
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
||||
if (is_cpy_supported) {
|
||||
/*
|
||||
|
||||
@@ -49,15 +49,7 @@ if (Vulkan_FOUND)
|
||||
../../include/ggml-vulkan.h
|
||||
)
|
||||
|
||||
set(VULKAN_SHADER_GEN_CMAKE_ARGS
|
||||
-DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}
|
||||
-DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
|
||||
)
|
||||
|
||||
set(VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS "")
|
||||
if (CMAKE_BUILD_TYPE AND CMAKE_BUILD_TYPE MATCHES "Debug|Release|MinSizeRel|RelWithDebInfo")
|
||||
list(APPEND VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS --config=${CMAKE_BUILD_TYPE})
|
||||
endif()
|
||||
set(VULKAN_SHADER_GEN_CMAKE_ARGS "")
|
||||
|
||||
# Test all shader extensions
|
||||
test_shader_extension_support(
|
||||
@@ -136,42 +128,39 @@ if (Vulkan_FOUND)
|
||||
set(HOST_CMAKE_TOOLCHAIN_FILE "")
|
||||
endif()
|
||||
|
||||
# Always use ExternalProject_Add approach
|
||||
include(ExternalProject)
|
||||
|
||||
# Add toolchain file if cross-compiling
|
||||
if (CMAKE_CROSSCOMPILING)
|
||||
list(APPEND VULKAN_SHADER_GEN_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${HOST_CMAKE_TOOLCHAIN_FILE})
|
||||
message(STATUS "vulkan-shaders-gen toolchain file: ${HOST_CMAKE_TOOLCHAIN_FILE}")
|
||||
endif()
|
||||
|
||||
# Native build through ExternalProject_Add
|
||||
ExternalProject_Add(
|
||||
vulkan-shaders-gen
|
||||
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders
|
||||
CMAKE_ARGS ${VULKAN_SHADER_GEN_CMAKE_ARGS}
|
||||
BUILD_COMMAND ${CMAKE_COMMAND} --build . ${VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS}
|
||||
INSTALL_COMMAND ${CMAKE_COMMAND} --install .
|
||||
INSTALL_DIR ${CMAKE_BINARY_DIR}
|
||||
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/$<CONFIG>
|
||||
-DCMAKE_INSTALL_BINDIR=.
|
||||
-DCMAKE_BUILD_TYPE=$<CONFIG>
|
||||
${VULKAN_SHADER_GEN_CMAKE_ARGS}
|
||||
|
||||
BUILD_COMMAND ${CMAKE_COMMAND} --build . --config $<CONFIG>
|
||||
INSTALL_COMMAND ${CMAKE_COMMAND} --install . --config $<CONFIG>
|
||||
)
|
||||
ExternalProject_Add_StepTargets(vulkan-shaders-gen build install)
|
||||
|
||||
set (_ggml_vk_host_suffix $<IF:$<STREQUAL:${CMAKE_HOST_SYSTEM_NAME},Windows>,.exe,>)
|
||||
set (_ggml_vk_genshaders_cmd ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/vulkan-shaders-gen${_ggml_vk_host_suffix})
|
||||
set (_ggml_vk_header ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
|
||||
set (_ggml_vk_source ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
|
||||
set (_ggml_vk_input_dir ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
|
||||
set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
|
||||
set (_ggml_vk_genshaders_dir "${CMAKE_BINARY_DIR}/$<CONFIG>")
|
||||
set (_ggml_vk_genshaders_cmd "${_ggml_vk_genshaders_dir}/vulkan-shaders-gen${_ggml_vk_host_suffix}")
|
||||
set (_ggml_vk_header "${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp")
|
||||
set (_ggml_vk_source "${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp")
|
||||
set (_ggml_vk_input_dir "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders")
|
||||
set (_ggml_vk_output_dir "${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv")
|
||||
|
||||
file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
|
||||
set (_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen)
|
||||
|
||||
# Add build and install dependencies for all builds
|
||||
set(_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen-build vulkan-shaders-gen-install)
|
||||
file(GLOB _ggml_vk_shader_files CONFIGURE_DEPENDS "${_ggml_vk_input_dir}/*.comp")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${_ggml_vk_header}
|
||||
${_ggml_vk_source}
|
||||
${_ggml_vk_source}
|
||||
|
||||
COMMAND ${_ggml_vk_genshaders_cmd}
|
||||
--glslc ${Vulkan_GLSLC_EXECUTABLE}
|
||||
@@ -181,7 +170,11 @@ if (Vulkan_FOUND)
|
||||
--target-cpp ${_ggml_vk_source}
|
||||
--no-clean
|
||||
|
||||
DEPENDS ${_ggml_vk_shader_deps}
|
||||
DEPENDS ${_ggml_vk_shader_files}
|
||||
vulkan-shaders-gen
|
||||
vulkan-shaders-gen-build
|
||||
vulkan-shaders-gen-install
|
||||
|
||||
COMMENT "Generate vulkan shaders"
|
||||
)
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -25,15 +25,3 @@ add_executable(${TARGET} vulkan-shaders-gen.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
|
||||
|
||||
# Configure output directories for MSVC builds
|
||||
if(MSVC)
|
||||
# Get the main project's runtime output directory if possible
|
||||
if(DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY)
|
||||
foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string(TOUPPER ${CONFIG} CONFIG)
|
||||
set_target_properties(${TARGET} PROPERTIES
|
||||
RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
endforeach()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -291,6 +291,7 @@ class MODEL_ARCH(IntEnum):
|
||||
BERT = auto()
|
||||
NOMIC_BERT = auto()
|
||||
NOMIC_BERT_MOE = auto()
|
||||
NEO_BERT = auto()
|
||||
JINA_BERT_V2 = auto()
|
||||
BLOOM = auto()
|
||||
STABLELM = auto()
|
||||
@@ -343,6 +344,8 @@ class MODEL_ARCH(IntEnum):
|
||||
WAVTOKENIZER_DEC = auto()
|
||||
PLM = auto()
|
||||
BAILINGMOE = auto()
|
||||
DOTS1 = auto()
|
||||
ARCEE = auto()
|
||||
|
||||
|
||||
class VISION_PROJECTOR_TYPE(IntEnum):
|
||||
@@ -571,6 +574,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.BERT: "bert",
|
||||
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
|
||||
MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
|
||||
MODEL_ARCH.NEO_BERT: "neo-bert",
|
||||
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
|
||||
MODEL_ARCH.BLOOM: "bloom",
|
||||
MODEL_ARCH.STABLELM: "stablelm",
|
||||
@@ -623,6 +627,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
|
||||
MODEL_ARCH.PLM: "plm",
|
||||
MODEL_ARCH.BAILINGMOE: "bailingmoe",
|
||||
MODEL_ARCH.DOTS1: "dots1",
|
||||
MODEL_ARCH.ARCEE: "arcee",
|
||||
}
|
||||
|
||||
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
||||
@@ -1077,6 +1083,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||
],
|
||||
MODEL_ARCH.NEO_BERT: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.ENC_OUTPUT_NORM,
|
||||
MODEL_TENSOR.CLS,
|
||||
MODEL_TENSOR.CLS_OUT,
|
||||
],
|
||||
MODEL_ARCH.JINA_BERT_V2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||
@@ -2044,6 +2062,45 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
],
|
||||
MODEL_ARCH.DOTS1: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_EXP_PROBS_B,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
],
|
||||
MODEL_ARCH.ARCEE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
||||
@@ -271,7 +271,7 @@ class GGUFWriter:
|
||||
|
||||
def add_key_value(self, key: str, val: Any, vtype: GGUFValueType, sub_type: GGUFValueType | None = None) -> None:
|
||||
if any(key in kv_data for kv_data in self.kv_data):
|
||||
raise ValueError(f'Duplicated key name {key!r}')
|
||||
logger.warning(f'Duplicated key name {key!r}, overwriting it with new value {val!r} of type {vtype.name}')
|
||||
|
||||
self.kv_data[0][key] = GGUFValue(value=val, type=vtype, sub_type=sub_type)
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@ class TensorNameMap:
|
||||
"model.embeddings", # rwkv7
|
||||
"model.word_embeddings", # bailingmoe
|
||||
"language_model.model.embed_tokens", # llama4
|
||||
"encoder", # neobert
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
@@ -134,6 +135,7 @@ class TensorNameMap:
|
||||
"rwkv.blocks.{bid}.ln1", # rwkv6
|
||||
"model.layers.{bid}.ln1", # rwkv7
|
||||
"model.layers.{bid}.input_layernorm", # llama4
|
||||
"transformer_encoder.{bid}.attention_norm", # neobert
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
@@ -161,6 +163,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.self_attn.qkv_proj", # phi3
|
||||
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm
|
||||
"transformer.layers.{bid}.attn.qkv_proj", # openelm
|
||||
"transformer_encoder.{bid}.qkv", # neobert
|
||||
),
|
||||
|
||||
# Attention query
|
||||
@@ -236,6 +239,7 @@ class TensorNameMap:
|
||||
"transformer.layers.{bid}.attn.out_proj", # openelm
|
||||
"transformer.h.{bid}.attn.attention.out_proj", # exaone
|
||||
"model.layers.{bid}.self_attn.o_proj", # llama4
|
||||
"transformer_encoder.{bid}.wo", # neobert
|
||||
),
|
||||
|
||||
# Attention output norm
|
||||
@@ -276,6 +280,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.post_attention_layernorm", # chatglm
|
||||
"transformer.layers.{bid}.ffn_norm", # openelm
|
||||
"model.layers.{bid}.post_attention_layernorm", # llama4
|
||||
"transformer_encoder.{bid}.ffn_norm", # neobert
|
||||
),
|
||||
|
||||
# Post feed-forward norm
|
||||
@@ -305,7 +310,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_EXP_PROBS_B: (
|
||||
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
|
||||
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
|
||||
),
|
||||
|
||||
# Feed-forward up
|
||||
@@ -333,11 +338,14 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||
"encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe
|
||||
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
||||
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
|
||||
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 (split up/gate, no longer used)
|
||||
"encoder.layer.{bid}.mlp.gated_layers", # jina-bert-v2 (GEGLU)
|
||||
"encoder.layer.{bid}.mlp.up_gated_layer", # jina-v2-code (GEGLU)
|
||||
"model.layers.{bid}.residual_mlp.w3", # arctic
|
||||
"encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
|
||||
"transformer.h.{bid}.mlp.c_fc_1", # exaone
|
||||
"model.layers.{bid}.feed_forward.up_proj", # llama4
|
||||
"transformer_encoder.{bid}.ffn.w12", # neobert
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
@@ -370,7 +378,7 @@ class TensorNameMap:
|
||||
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w1", # internlm2
|
||||
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
|
||||
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
|
||||
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used)
|
||||
"transformer.h.{bid}.mlp.linear_1", # refact
|
||||
"model.layers.{bid}.residual_mlp.w1", # arctic
|
||||
"transformer.h.{bid}.mlp.c_fc_0", # exaone
|
||||
@@ -420,6 +428,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
|
||||
"model.layers.h.{bid}.mlp.c_proj", # exaone
|
||||
"model.layers.{bid}.feed_forward.down_proj", # llama4
|
||||
"transformer_encoder.{bid}.ffn.w3", # neobert
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
@@ -830,12 +839,14 @@ class TensorNameMap:
|
||||
# TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
|
||||
MODEL_TENSOR.ENC_OUTPUT_NORM: (
|
||||
"encoder.final_layer_norm", # t5
|
||||
"layer_norm", # neobert
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CLS: (
|
||||
"classifier", # jina
|
||||
"classifier.dense", # roberta
|
||||
"pre_classifier", # distillbert
|
||||
"dense", # neobert
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CLS_OUT: (
|
||||
|
||||
+10
-7
@@ -243,18 +243,21 @@ extern "C" {
|
||||
|
||||
typedef bool (*llama_progress_callback)(float progress, void * user_data);
|
||||
|
||||
// Input data for llama_decode
|
||||
// Input data for llama_encode/llama_decode
|
||||
// A llama_batch object can contain input about one or many sequences
|
||||
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
|
||||
//
|
||||
// - token : the token ids of the input (used when embd is NULL)
|
||||
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
|
||||
// - pos : the positions of the respective token in the sequence
|
||||
// (if set to NULL, the token position will be tracked automatically by llama_decode)
|
||||
// (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
|
||||
// - seq_id : the sequence to which the respective token belongs
|
||||
// (if set to NULL, the sequence ID will be assumed to be 0)
|
||||
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
|
||||
// (if set to NULL, only the logits for last token will be returned)
|
||||
// (if set to NULL:
|
||||
// - if embeddings: all tokens are output
|
||||
// - if not: only the last token is output
|
||||
// )
|
||||
//
|
||||
typedef struct llama_batch {
|
||||
int32_t n_tokens;
|
||||
@@ -262,8 +265,8 @@ extern "C" {
|
||||
llama_token * token;
|
||||
float * embd;
|
||||
llama_pos * pos;
|
||||
int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence
|
||||
llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id;
|
||||
int32_t * n_seq_id;
|
||||
llama_seq_id ** seq_id;
|
||||
int8_t * logits; // TODO: rename this to "output"
|
||||
} llama_batch;
|
||||
|
||||
@@ -961,8 +964,8 @@ extern "C" {
|
||||
// Get the number of threads used for prompt and batch processing (multiple token).
|
||||
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
|
||||
|
||||
// Set whether the model is in embeddings mode or not
|
||||
// If true, embeddings will be returned but logits will not
|
||||
// Set whether the context outputs embeddings or not
|
||||
// TODO: rename to avoid confusion with llama_get_embeddings()
|
||||
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
|
||||
|
||||
// Set whether to use causal attention or not
|
||||
|
||||
@@ -1,2 +1,3 @@
|
||||
tabulate~=0.9.0
|
||||
GitPython~=3.1.43
|
||||
matplotlib~=3.10.0
|
||||
|
||||
@@ -19,6 +19,7 @@ except ImportError as e:
|
||||
print("the following Python libraries are required: GitPython, tabulate.") # noqa: NP100
|
||||
raise e
|
||||
|
||||
|
||||
logger = logging.getLogger("compare-llama-bench")
|
||||
|
||||
# All llama-bench SQL fields
|
||||
@@ -122,11 +123,15 @@ help_s = (
|
||||
parser.add_argument("--check", action="store_true", help="check if all required Python libraries are installed")
|
||||
parser.add_argument("-s", "--show", help=help_s)
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
parser.add_argument("--plot", help="generate a performance comparison plot and save to specified file (e.g., plot.png)")
|
||||
parser.add_argument("--plot_x", help="parameter to use as x axis for plotting (default: n_depth)", default="n_depth")
|
||||
parser.add_argument("--plot_log_scale", action="store_true", help="use log scale for x axis in plots (off by default)")
|
||||
|
||||
known_args, unknown_args = parser.parse_known_args()
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
|
||||
|
||||
|
||||
if known_args.check:
|
||||
# Check if all required Python libraries are installed. Would have failed earlier if not.
|
||||
sys.exit(0)
|
||||
@@ -499,7 +504,6 @@ else:
|
||||
|
||||
name_compare = bench_data.get_commit_name(hexsha8_compare)
|
||||
|
||||
|
||||
# If the user provided columns to group the results by, use them:
|
||||
if known_args.show is not None:
|
||||
show = known_args.show.split(",")
|
||||
@@ -544,6 +548,14 @@ else:
|
||||
show.remove(prop)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Add plot_x parameter to parameters to show if it's not already present:
|
||||
if known_args.plot:
|
||||
for k, v in PRETTY_NAMES.items():
|
||||
if v == known_args.plot_x and k not in show:
|
||||
show.append(k)
|
||||
break
|
||||
|
||||
rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare)
|
||||
|
||||
if not rows_show:
|
||||
@@ -600,6 +612,161 @@ if "gpu_info" in show:
|
||||
headers = [PRETTY_NAMES[p] for p in show]
|
||||
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
|
||||
|
||||
if known_args.plot:
|
||||
def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False):
|
||||
try:
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
except ImportError as e:
|
||||
logger.error("matplotlib is required for --plot.")
|
||||
raise e
|
||||
|
||||
data_headers = headers[:-4] # Exclude the last 4 columns (Test, baseline t/s, compare t/s, Speedup)
|
||||
plot_x_index = None
|
||||
plot_x_label = plot_x_param
|
||||
|
||||
if plot_x_param not in ["n_prompt", "n_gen", "n_depth"]:
|
||||
pretty_name = PRETTY_NAMES.get(plot_x_param, plot_x_param)
|
||||
if pretty_name in data_headers:
|
||||
plot_x_index = data_headers.index(pretty_name)
|
||||
plot_x_label = pretty_name
|
||||
elif plot_x_param in data_headers:
|
||||
plot_x_index = data_headers.index(plot_x_param)
|
||||
plot_x_label = plot_x_param
|
||||
else:
|
||||
logger.error(f"Parameter '{plot_x_param}' not found in current table columns. Available columns: {', '.join(data_headers)}")
|
||||
return
|
||||
|
||||
grouped_data = {}
|
||||
|
||||
for i, row in enumerate(table_data):
|
||||
group_key_parts = []
|
||||
test_name = row[-4]
|
||||
|
||||
base_test = ""
|
||||
x_value = None
|
||||
|
||||
if plot_x_param in ["n_prompt", "n_gen", "n_depth"]:
|
||||
for j, val in enumerate(row[:-4]):
|
||||
header_name = data_headers[j]
|
||||
if val is not None and str(val).strip():
|
||||
group_key_parts.append(f"{header_name}={val}")
|
||||
|
||||
if plot_x_param == "n_prompt" and "pp" in test_name:
|
||||
base_test = test_name.split("@")[0]
|
||||
x_value = base_test
|
||||
elif plot_x_param == "n_gen" and "tg" in test_name:
|
||||
x_value = test_name.split("@")[0]
|
||||
elif plot_x_param == "n_depth" and "@d" in test_name:
|
||||
base_test = test_name.split("@d")[0]
|
||||
x_value = int(test_name.split("@d")[1])
|
||||
else:
|
||||
base_test = test_name
|
||||
|
||||
if base_test.strip():
|
||||
group_key_parts.append(f"Test={base_test}")
|
||||
else:
|
||||
for j, val in enumerate(row[:-4]):
|
||||
if j != plot_x_index:
|
||||
header_name = data_headers[j]
|
||||
if val is not None and str(val).strip():
|
||||
group_key_parts.append(f"{header_name}={val}")
|
||||
else:
|
||||
x_value = val
|
||||
|
||||
group_key_parts.append(f"Test={test_name}")
|
||||
|
||||
group_key = tuple(group_key_parts)
|
||||
|
||||
if group_key not in grouped_data:
|
||||
grouped_data[group_key] = []
|
||||
|
||||
grouped_data[group_key].append({
|
||||
'x_value': x_value,
|
||||
'baseline': float(row[-3]),
|
||||
'compare': float(row[-2]),
|
||||
'speedup': float(row[-1])
|
||||
})
|
||||
|
||||
if not grouped_data:
|
||||
logger.error("No data available for plotting")
|
||||
return
|
||||
|
||||
def make_axes(num_groups, max_cols=2, base_size=(8, 4)):
|
||||
from math import ceil
|
||||
cols = 1 if num_groups == 1 else min(max_cols, num_groups)
|
||||
rows = ceil(num_groups / cols)
|
||||
|
||||
# Scale figure size by grid dimensions
|
||||
w, h = base_size
|
||||
fig, ax_arr = plt.subplots(rows, cols,
|
||||
figsize=(w * cols, h * rows),
|
||||
squeeze=False)
|
||||
|
||||
axes = ax_arr.flatten()[:num_groups]
|
||||
return fig, axes
|
||||
|
||||
num_groups = len(grouped_data)
|
||||
fig, axes = make_axes(num_groups)
|
||||
|
||||
plot_idx = 0
|
||||
|
||||
for group_key, points in grouped_data.items():
|
||||
if plot_idx >= len(axes):
|
||||
break
|
||||
ax = axes[plot_idx]
|
||||
|
||||
try:
|
||||
points_sorted = sorted(points, key=lambda p: float(p['x_value']) if p['x_value'] is not None else 0)
|
||||
x_values = [float(p['x_value']) if p['x_value'] is not None else 0 for p in points_sorted]
|
||||
except ValueError:
|
||||
points_sorted = sorted(points, key=lambda p: group_key)
|
||||
x_values = [p['x_value'] for p in points_sorted]
|
||||
|
||||
baseline_vals = [p['baseline'] for p in points_sorted]
|
||||
compare_vals = [p['compare'] for p in points_sorted]
|
||||
|
||||
ax.plot(x_values, baseline_vals, 'o-', color='skyblue',
|
||||
label=f'{baseline_name}', linewidth=2, markersize=6)
|
||||
ax.plot(x_values, compare_vals, 's--', color='lightcoral', alpha=0.8,
|
||||
label=f'{compare_name}', linewidth=2, markersize=6)
|
||||
|
||||
if log_scale:
|
||||
ax.set_xscale('log', base=2)
|
||||
unique_x = sorted(set(x_values))
|
||||
ax.set_xticks(unique_x)
|
||||
ax.set_xticklabels([str(int(x)) for x in unique_x])
|
||||
|
||||
title_parts = []
|
||||
for part in group_key:
|
||||
if '=' in part:
|
||||
key, value = part.split('=', 1)
|
||||
title_parts.append(f"{key}: {value}")
|
||||
|
||||
title = ', '.join(title_parts) if title_parts else "Performance comparison"
|
||||
|
||||
ax.set_xlabel(plot_x_label, fontsize=12, fontweight='bold')
|
||||
ax.set_ylabel('Tokens per second (t/s)', fontsize=12, fontweight='bold')
|
||||
ax.set_title(title, fontsize=12, fontweight='bold')
|
||||
ax.legend(loc='best', fontsize=10)
|
||||
ax.grid(True, alpha=0.3)
|
||||
|
||||
plot_idx += 1
|
||||
|
||||
for i in range(plot_idx, len(axes)):
|
||||
axes[i].set_visible(False)
|
||||
|
||||
fig.suptitle(f'Performance comparison: {compare_name} vs. {baseline_name}',
|
||||
fontsize=14, fontweight='bold')
|
||||
fig.subplots_adjust(top=1)
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
||||
plt.close()
|
||||
|
||||
create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale)
|
||||
|
||||
print(tabulate( # noqa: NP100
|
||||
table,
|
||||
headers=headers,
|
||||
|
||||
@@ -1 +1 @@
|
||||
94a83ba5a725ae2aee79df75dd99b2119d0478cc
|
||||
6a7d170c04789f6ebcf320ed03c1b16973f93bd7
|
||||
|
||||
@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_BERT, "bert" },
|
||||
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
||||
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
|
||||
{ LLM_ARCH_NEO_BERT, "neo-bert" },
|
||||
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
|
||||
{ LLM_ARCH_BLOOM, "bloom" },
|
||||
{ LLM_ARCH_STABLELM, "stablelm" },
|
||||
@@ -72,6 +73,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
||||
{ LLM_ARCH_PLM, "plm" },
|
||||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
||||
{ LLM_ARCH_DOTS1, "dots1" },
|
||||
{ LLM_ARCH_ARCEE, "arcee" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@@ -243,6 +246,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_ARCEE,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_LLAMA4,
|
||||
{
|
||||
@@ -494,6 +515,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_NEO_BERT,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
|
||||
{ LLM_TENSOR_CLS, "cls" },
|
||||
{ LLM_TENSOR_CLS_OUT, "cls.output" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_JINA_BERT_V2,
|
||||
{
|
||||
@@ -1555,6 +1591,34 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_DOTS1,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
||||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
||||
}
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
|
||||
@@ -24,6 +24,7 @@ enum llm_arch {
|
||||
LLM_ARCH_BERT,
|
||||
LLM_ARCH_NOMIC_BERT,
|
||||
LLM_ARCH_NOMIC_BERT_MOE,
|
||||
LLM_ARCH_NEO_BERT,
|
||||
LLM_ARCH_JINA_BERT_V2,
|
||||
LLM_ARCH_BLOOM,
|
||||
LLM_ARCH_STABLELM,
|
||||
@@ -76,6 +77,8 @@ enum llm_arch {
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
LLM_ARCH_PLM,
|
||||
LLM_ARCH_BAILINGMOE,
|
||||
LLM_ARCH_DOTS1,
|
||||
LLM_ARCH_ARCEE,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
|
||||
+272
-21
@@ -1,8 +1,14 @@
|
||||
#include "llama-batch.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-cparams.h"
|
||||
#include "llama-vocab.h"
|
||||
#include "llama-memory.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
|
||||
llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
|
||||
// clear empty sequences
|
||||
@@ -105,12 +111,7 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
|
||||
ubatch.seq_id = batch->seq_id + seq.offset;
|
||||
}
|
||||
}
|
||||
if (logits_all) {
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
ubatch.output[ubatch.n_tokens + i] = 1;
|
||||
out_ids.push_back(ids[seq.offset + i]);
|
||||
}
|
||||
} else if (batch->logits) {
|
||||
if (batch->logits) {
|
||||
if (ubatch.equal_seqs) {
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
size_t id = ids[seq.offset + i];
|
||||
@@ -197,11 +198,10 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
|
||||
return ubatch;
|
||||
}
|
||||
|
||||
llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
|
||||
llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split) {
|
||||
GGML_ASSERT(batch.n_tokens >= 0);
|
||||
this->batch = &batch;
|
||||
this->n_embd = n_embd;
|
||||
this->logits_all = logits_all;
|
||||
|
||||
n_tokens = batch.n_tokens;
|
||||
ids.resize(n_tokens);
|
||||
@@ -285,17 +285,56 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
|
||||
);
|
||||
}
|
||||
|
||||
llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0) {
|
||||
batch = in_batch;
|
||||
GGML_ASSERT(batch.n_tokens > 0);
|
||||
if (!batch.pos) {
|
||||
assert(p0 >= 0);
|
||||
pos.resize(batch.n_tokens);
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
pos[i] = p0 + i;
|
||||
}
|
||||
batch.pos = pos.data();
|
||||
llama_batch_allocr::llama_batch_allocr() {
|
||||
const char * LLAMA_BATCH_DEBUG = getenv("LLAMA_BATCH_DEBUG");
|
||||
debug = LLAMA_BATCH_DEBUG ? atoi(LLAMA_BATCH_DEBUG) : 0;
|
||||
|
||||
seq_pos.resize(LLAMA_MAX_SEQ);
|
||||
seq_cpl.resize(LLAMA_MAX_SEQ);
|
||||
for (auto & cur : seq_cpl) {
|
||||
cur.resize(LLAMA_MAX_SEQ);
|
||||
}
|
||||
}
|
||||
|
||||
bool llama_batch_allocr::init(
|
||||
const llama_batch & batch_inp,
|
||||
const llama_vocab & vocab,
|
||||
const llama_memory_i * memory,
|
||||
bool embd_all) {
|
||||
clear();
|
||||
|
||||
batch = batch_inp;
|
||||
|
||||
GGML_ASSERT(batch.n_tokens > 0);
|
||||
|
||||
//
|
||||
// validate input batch
|
||||
//
|
||||
|
||||
if (batch.token) {
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (batch.seq_id) {
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||
if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_SEQ)) {
|
||||
LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_SEQ);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// auto-generate missing fields
|
||||
//
|
||||
|
||||
if (!batch.n_seq_id) {
|
||||
n_seq_id.resize(batch.n_tokens);
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
@@ -303,6 +342,7 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
|
||||
}
|
||||
batch.n_seq_id = n_seq_id.data();
|
||||
}
|
||||
|
||||
if (!batch.seq_id) {
|
||||
seq_id.resize(batch.n_tokens + 1);
|
||||
seq_id[batch.n_tokens] = NULL;
|
||||
@@ -311,10 +351,221 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
|
||||
}
|
||||
batch.seq_id = seq_id.data();
|
||||
}
|
||||
|
||||
if (!batch.pos) {
|
||||
pos.resize(batch.n_tokens);
|
||||
|
||||
// initialize the starting position for each sequence based on the positions in the memory
|
||||
llama_pos p0[LLAMA_MAX_SEQ];
|
||||
for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (!memory) {
|
||||
p0[s] = 0;
|
||||
} else {
|
||||
p0[s] = memory->seq_pos_max(s) + 1;
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
const llama_seq_id seq_id = batch.seq_id[i][0];
|
||||
|
||||
pos[i] = p0[seq_id];
|
||||
|
||||
for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||
p0[batch.seq_id[i][s]] = pos[i] + 1;
|
||||
}
|
||||
}
|
||||
|
||||
batch.pos = pos.data();
|
||||
}
|
||||
|
||||
if (!batch.logits) {
|
||||
logits.resize(batch.n_tokens);
|
||||
logits[logits.size() - 1] = true;
|
||||
batch.logits = logits.data();
|
||||
if (embd_all) {
|
||||
// return the output for all tokens
|
||||
output.resize(batch.n_tokens, true);
|
||||
} else {
|
||||
// return the output only for the last token
|
||||
output.resize(batch.n_tokens, false);
|
||||
output[output.size() - 1] = true;
|
||||
}
|
||||
|
||||
batch.logits = output.data();
|
||||
} else if (embd_all) {
|
||||
bool warn = false;
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
if (batch.logits[i] == 0) {
|
||||
warn = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (warn) {
|
||||
LLAMA_LOG_WARN("%s: embeddings required but some input tokens were not marked as outputs -> overriding\n", __func__);
|
||||
|
||||
output.resize(batch.n_tokens, true);
|
||||
batch.logits = output.data();
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// compute stats
|
||||
//
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
n_outputs += batch.logits[i] != 0;
|
||||
}
|
||||
|
||||
// determine coupled sequences
|
||||
// these are pairs of sequences that have at least one token in the input batch that is assigned to both of them
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||
seq_pos[batch.seq_id[i][s]].insert(batch.pos[i]);
|
||||
|
||||
if (s > 0) {
|
||||
const llama_seq_id s0 = batch.seq_id[i][0];
|
||||
const llama_seq_id s1 = batch.seq_id[i][s];
|
||||
|
||||
// mark that sequence s1 is coupled to s0
|
||||
seq_cpl[s1][s0] = true;
|
||||
|
||||
// note: the other way around is not necessary for now
|
||||
//seq_cpl[s0][s1] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (debug > 0) {
|
||||
LLAMA_LOG_DEBUG("%s: input batch info:\n", __func__);
|
||||
LLAMA_LOG_DEBUG("%s: n_tokens = %d\n", __func__, batch.n_tokens);
|
||||
LLAMA_LOG_DEBUG("%s: token = %p\n", __func__, (void *) batch.token);
|
||||
LLAMA_LOG_DEBUG("%s: embd = %p\n", __func__, (void *) batch.embd);
|
||||
LLAMA_LOG_DEBUG("%s: pos = %p\n", __func__, (void *) batch.pos);
|
||||
LLAMA_LOG_DEBUG("%s: n_seq_id = %p\n", __func__, (void *) batch.n_seq_id);
|
||||
LLAMA_LOG_DEBUG("%s: seq_id = %p\n", __func__, (void *) batch.seq_id);
|
||||
LLAMA_LOG_DEBUG("%s: logits = %p\n", __func__, (void *) batch.logits);
|
||||
LLAMA_LOG_DEBUG("%s: n_outputs = %d\n", __func__, n_outputs);
|
||||
|
||||
if (debug > 1) {
|
||||
int seq_id_max = 0;
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||
seq_id_max = std::max(seq_id_max, batch.seq_id[i][s]);
|
||||
}
|
||||
}
|
||||
}
|
||||
++seq_id_max;
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: token = [\n", __func__);
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
std::vector<int8_t> seq_id(seq_id_max);
|
||||
|
||||
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||
seq_id[batch.seq_id[i][s]] = 1;
|
||||
}
|
||||
|
||||
std::stringstream ss;
|
||||
for (int s = 0; s < seq_id_max; ++s) {
|
||||
if (seq_id[s]) {
|
||||
ss << s%10;
|
||||
} else {
|
||||
ss << ".";
|
||||
}
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
|
||||
__func__, i, batch.token[i], vocab.token_to_piece(batch.token[i]).c_str(),
|
||||
batch.pos[i], batch.n_seq_id[i], ss.str().c_str(), batch.logits[i]);
|
||||
}
|
||||
LLAMA_LOG_DEBUG("%s: ]\n", __func__);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: seq = [\n", __func__);
|
||||
for (int s0 = 0; s0 < (int) seq_pos.size(); ++s0) {
|
||||
if (seq_pos[s0].empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::stringstream ss;
|
||||
for (int s1 = 0; s1 < (int) seq_cpl[s0].size(); ++s1) {
|
||||
if (seq_cpl[s0][s1]) {
|
||||
ss << s1 << " ";
|
||||
}
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: %4d: pos = [%4d, %4d], cpl = %s\n",
|
||||
__func__, s0, seq_pos_min(s0), seq_pos_max(s0), ss.str().empty() ? "-" : ss.str().c_str());
|
||||
}
|
||||
LLAMA_LOG_DEBUG("%s: ]\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// consistency checks
|
||||
//
|
||||
|
||||
for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (seq_pos[s].empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (memory && seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
|
||||
LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
|
||||
LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (memory) {
|
||||
for (int32_t s0 = 0; s0 < LLAMA_MAX_SEQ; ++s0) {
|
||||
for (int32_t s1 = 0; s1 < LLAMA_MAX_SEQ; ++s1) {
|
||||
if (seq_cpl[s0][s1]) {
|
||||
if (memory->seq_pos_min(s0) != memory->seq_pos_min(s1) ||
|
||||
memory->seq_pos_max(s0) != memory->seq_pos_max(s1)) {
|
||||
LLAMA_LOG_ERROR("%s: sequence %d is coupled to %d in the input batch, but have divereged\n", __func__, s0, s1);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
const llama_batch & llama_batch_allocr::get_batch() const {
|
||||
return batch;
|
||||
}
|
||||
|
||||
uint32_t llama_batch_allocr::get_n_outputs() const {
|
||||
return n_outputs;
|
||||
}
|
||||
|
||||
llama_pos llama_batch_allocr::seq_pos_min(llama_seq_id seq_id) const {
|
||||
return seq_pos[seq_id].empty() ? -1 : *seq_pos[seq_id].begin();
|
||||
}
|
||||
|
||||
llama_pos llama_batch_allocr::seq_pos_max(llama_seq_id seq_id) const {
|
||||
return seq_pos[seq_id].empty() ? -1 : *seq_pos[seq_id].rbegin();
|
||||
}
|
||||
|
||||
void llama_batch_allocr::clear() {
|
||||
n_outputs = 0;
|
||||
|
||||
batch = {};
|
||||
pos.clear();
|
||||
n_seq_id.clear();
|
||||
seq_id.clear();
|
||||
output.clear();
|
||||
|
||||
for (auto & cur : seq_pos) {
|
||||
cur.clear();
|
||||
}
|
||||
|
||||
for (auto & cur : seq_cpl) {
|
||||
std::fill(cur.begin(), cur.end(), false);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+36
-11
@@ -4,6 +4,7 @@
|
||||
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
|
||||
// very similar to llama_batch,
|
||||
// but has more metadata about sequences
|
||||
@@ -18,8 +19,8 @@ struct llama_ubatch {
|
||||
llama_token * token; // [n_tokens]
|
||||
float * embd; // [n_embd, n_tokens]
|
||||
llama_pos * pos; // [n_tokens]
|
||||
int32_t * n_seq_id; // [n_seqs] // TODO: remove, should belong to only 1 sequence
|
||||
llama_seq_id ** seq_id; // [n_seqs] // TODO: become llama_seq_id * seq_id;
|
||||
int32_t * n_seq_id; // [n_seqs]
|
||||
llama_seq_id ** seq_id; // [n_seqs]
|
||||
int8_t * output; // [n_tokens]
|
||||
};
|
||||
|
||||
@@ -39,8 +40,6 @@ struct llama_sbatch {
|
||||
|
||||
size_t n_embd;
|
||||
|
||||
bool logits_all; // TODO: remove once lctx.logits_all is removed too
|
||||
|
||||
// sorted indices into the batch
|
||||
std::vector<int64_t> ids;
|
||||
// batch indices of the output
|
||||
@@ -76,19 +75,45 @@ struct llama_sbatch {
|
||||
llama_ubatch split_seq(size_t n_ubatch);
|
||||
|
||||
llama_sbatch() = default;
|
||||
llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
|
||||
llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false);
|
||||
};
|
||||
|
||||
// temporary allocate memory for the input batch if needed
|
||||
struct llama_batch_allocr {
|
||||
struct llama_batch batch;
|
||||
// a helper for sanitizing and fulfilling a batch
|
||||
class llama_batch_allocr {
|
||||
public:
|
||||
llama_batch_allocr();
|
||||
|
||||
// sanitize and auto-gen missing data in the input batch
|
||||
// memory is optional. if provided will be used to check for sequence continuity and to determine the positions
|
||||
bool init(
|
||||
const llama_batch & batch_inp,
|
||||
const llama_vocab & vocab,
|
||||
const llama_memory_i * memory,
|
||||
bool embd_all);
|
||||
|
||||
const llama_batch & get_batch() const;
|
||||
|
||||
uint32_t get_n_outputs() const;
|
||||
|
||||
llama_pos seq_pos_min(llama_seq_id seq_id) const;
|
||||
llama_pos seq_pos_max(llama_seq_id seq_id) const;
|
||||
|
||||
private:
|
||||
void clear();
|
||||
|
||||
llama_batch batch;
|
||||
|
||||
uint32_t n_outputs;
|
||||
|
||||
std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
|
||||
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id *> seq_id;
|
||||
std::vector<int8_t> logits;
|
||||
std::vector<int8_t> output;
|
||||
|
||||
// optionally fulfill the batch returned by llama_batch_get_one
|
||||
llama_batch_allocr(struct llama_batch in_batch, llama_pos p0);
|
||||
std::vector<std::set<llama_pos>> seq_pos; // seq_pos[s]: the set of positions in sequence s
|
||||
std::vector<std::vector<bool>> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
|
||||
|
||||
int debug;
|
||||
};
|
||||
|
||||
@@ -183,6 +183,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
return LLM_CHAT_TEMPLATE_BAILING;
|
||||
} else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
|
||||
return LLM_CHAT_TEMPLATE_LLAMA4;
|
||||
} else if (tmpl_contains("<|endofuserprompt|>")) {
|
||||
return LLM_CHAT_TEMPLATE_DOTS1;
|
||||
}
|
||||
return LLM_CHAT_TEMPLATE_UNKNOWN;
|
||||
}
|
||||
@@ -643,6 +645,21 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "Assistant:";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_DOTS1) {
|
||||
// dots.llm1.inst (DOTS1)
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
if (role == "system") {
|
||||
ss << "<|system|>" << message->content << "<|endofsystem|>";
|
||||
} else if (role == "user") {
|
||||
ss << "<|userprompt|>" << message->content << "<|endofuserprompt|>";
|
||||
} else {
|
||||
ss << "<|response|>" << message->content << "<|endofresponse|>";
|
||||
}
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|response|>";
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
|
||||
@@ -43,6 +43,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_BAILING,
|
||||
LLM_CHAT_TEMPLATE_LLAMA4,
|
||||
LLM_CHAT_TEMPLATE_SMOLVLM,
|
||||
LLM_CHAT_TEMPLATE_DOTS1,
|
||||
LLM_CHAT_TEMPLATE_UNKNOWN,
|
||||
};
|
||||
|
||||
|
||||
+79
-110
@@ -1,6 +1,7 @@
|
||||
#include "llama-context.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-io.h"
|
||||
#include "llama-memory.h"
|
||||
#include "llama-mmap.h"
|
||||
@@ -18,7 +19,8 @@
|
||||
llama_context::llama_context(
|
||||
const llama_model & model,
|
||||
llama_context_params params) :
|
||||
model(model) {
|
||||
model(model),
|
||||
batch_allocr(std::make_unique<llama_batch_allocr>()) {
|
||||
LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
|
||||
|
||||
t_start_us = model.t_start_us;
|
||||
@@ -27,8 +29,8 @@ llama_context::llama_context(
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
cparams.n_seq_max = std::max(1u, params.n_seq_max);
|
||||
if (cparams.n_seq_max > LLAMA_MAX_PARALLEL_SEQUENCES) {
|
||||
throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_PARALLEL_SEQUENCES));
|
||||
if (cparams.n_seq_max > LLAMA_MAX_SEQ) {
|
||||
throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_SEQ));
|
||||
}
|
||||
|
||||
cparams.n_threads = params.n_threads;
|
||||
@@ -494,7 +496,7 @@ float * llama_context::get_logits() {
|
||||
}
|
||||
|
||||
float * llama_context::get_logits_ith(int32_t i) {
|
||||
int32_t j = -1;
|
||||
int64_t j = -1;
|
||||
|
||||
try {
|
||||
if (logits == nullptr) {
|
||||
@@ -517,7 +519,7 @@ float * llama_context::get_logits_ith(int32_t i) {
|
||||
}
|
||||
if (j >= n_outputs) {
|
||||
// This should not happen
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
|
||||
}
|
||||
|
||||
return logits + j*model.vocab.n_tokens();
|
||||
@@ -536,7 +538,7 @@ float * llama_context::get_embeddings() {
|
||||
}
|
||||
|
||||
float * llama_context::get_embeddings_ith(int32_t i) {
|
||||
int32_t j = -1;
|
||||
int64_t j = -1;
|
||||
|
||||
try {
|
||||
if (embd == nullptr) {
|
||||
@@ -559,7 +561,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
|
||||
}
|
||||
if (j >= n_outputs) {
|
||||
// This should not happen
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
|
||||
}
|
||||
|
||||
return embd + j*model.hparams.n_embd;
|
||||
@@ -719,52 +721,41 @@ llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch,
|
||||
return res;
|
||||
}
|
||||
|
||||
int llama_context::encode(llama_batch & inp_batch) {
|
||||
if (inp_batch.n_tokens == 0) {
|
||||
int llama_context::encode(const llama_batch & batch_inp) {
|
||||
if (batch_inp.n_tokens == 0) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// temporary allocate memory for the input batch if needed
|
||||
// note: during encode, we always pass the full sequence starting from pos = 0
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : 0);
|
||||
if (!batch_allocr->init(batch_inp, model.vocab, nullptr, true)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
const llama_batch & batch = batch_allocr.batch;
|
||||
const int32_t n_tokens = batch.n_tokens;
|
||||
const llama_batch & batch = batch_allocr->get_batch();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
const uint32_t n_tokens = batch.n_tokens;
|
||||
|
||||
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
||||
|
||||
// TODO: move the validation to the llama_batch_allocr
|
||||
if (batch.token) {
|
||||
for (int32_t i = 0; i < n_tokens; ++i) {
|
||||
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
|
||||
LLAMA_LOG_ERROR("%s: invalid seq_id[%d] = %d > %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
|
||||
throw -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
|
||||
GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens");
|
||||
GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
|
||||
|
||||
if (t_compute_start_us == 0) {
|
||||
t_compute_start_us = ggml_time_us();
|
||||
}
|
||||
|
||||
// TODO: this clear of the buffer can easily be forgotten - need something better
|
||||
embd_seq.clear();
|
||||
|
||||
n_queued_tokens += n_tokens;
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
|
||||
llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true);
|
||||
|
||||
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
|
||||
|
||||
@@ -774,7 +765,7 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
return -2;
|
||||
};
|
||||
|
||||
for (int32_t i = 0; i < n_tokens; ++i) {
|
||||
for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
output_ids[i] = i;
|
||||
}
|
||||
|
||||
@@ -830,7 +821,8 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
|
||||
GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits
|
||||
|
||||
for (int32_t i = 0; i < n_tokens; i++) {
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
const llama_seq_id seq_id = ubatch.seq_id[i][0];
|
||||
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
|
||||
continue;
|
||||
@@ -845,6 +837,7 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
auto & embd_seq_out = embd_seq;
|
||||
const uint32_t n_cls_out = hparams.n_cls_out;
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = ubatch.seq_id[s][0];
|
||||
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
|
||||
@@ -878,10 +871,10 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
|
||||
// remember the sequence ids used during the encoding - needed for cross attention later
|
||||
cross.seq_ids_enc.resize(n_tokens);
|
||||
for (int32_t i = 0; i < n_tokens; i++) {
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
cross.seq_ids_enc[i].clear();
|
||||
for (int s = 0; s < ubatch.n_seq_id[i]; s++) {
|
||||
llama_seq_id seq_id = ubatch.seq_id[i][s];
|
||||
for (int s = 0; s < batch.n_seq_id[i]; s++) {
|
||||
llama_seq_id seq_id = batch.seq_id[i][s];
|
||||
cross.seq_ids_enc[i].insert(seq_id);
|
||||
}
|
||||
}
|
||||
@@ -890,51 +883,45 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int llama_context::decode(llama_batch & inp_batch) {
|
||||
int llama_context::decode(const llama_batch & batch_inp) {
|
||||
if (!memory) {
|
||||
LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
|
||||
return encode(inp_batch);
|
||||
return encode(batch_inp);
|
||||
}
|
||||
|
||||
if (inp_batch.n_tokens == 0) {
|
||||
if (batch_inp.n_tokens == 0) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!inp_batch.pos) {
|
||||
if (inp_batch.seq_id) {
|
||||
LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
// when computing embeddings, all tokens are output
|
||||
const bool embd_all = cparams.embeddings;
|
||||
|
||||
if (!batch_allocr->init(batch_inp, model.vocab, memory.get(), embd_all)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// temporary allocate memory for the input batch if needed
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : memory->seq_pos_max(0) + 1);
|
||||
|
||||
const llama_batch & batch = batch_allocr.batch;
|
||||
const llama_batch & batch = batch_allocr->get_batch();
|
||||
|
||||
const auto & vocab = model.vocab;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int32_t n_vocab = vocab.n_tokens();
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
const int64_t n_tokens_all = batch.n_tokens;
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const uint32_t n_tokens_all = batch.n_tokens;
|
||||
|
||||
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
||||
|
||||
// TODO: move the validation to the llama_batch_allocr
|
||||
if (batch.token) {
|
||||
for (int64_t i = 0; i < n_tokens_all; ++i) {
|
||||
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]);
|
||||
return -1;
|
||||
}
|
||||
const uint32_t n_outputs_all = batch_allocr->get_n_outputs();
|
||||
|
||||
if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
|
||||
LLAMA_LOG_ERROR("%s: invalid seq_id[%" PRId64 "] = %d >= %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
|
||||
return -1;
|
||||
}
|
||||
if (embd_all) {
|
||||
// require that all tokens are output
|
||||
if (n_outputs_all != n_tokens_all) {
|
||||
LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %d, n_tokens_all = %d)\n",
|
||||
__func__, n_outputs_all, n_tokens_all);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -947,25 +934,9 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
}
|
||||
n_queued_tokens += n_tokens_all;
|
||||
|
||||
// this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
|
||||
const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
|
||||
|
||||
// TODO: this clear of the buffer can easily be forgotten - need something better
|
||||
embd_seq.clear();
|
||||
|
||||
int64_t n_outputs_all = 0;
|
||||
|
||||
// count outputs
|
||||
if (batch.logits && !embd_pooled) {
|
||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||
n_outputs_all += batch.logits[i] != 0;
|
||||
}
|
||||
} else if (embd_pooled) {
|
||||
n_outputs_all = n_tokens_all;
|
||||
} else {
|
||||
// keep last output only
|
||||
n_outputs_all = 1;
|
||||
}
|
||||
|
||||
bool did_optimize = false;
|
||||
|
||||
// handle any pending defrags/shifts
|
||||
@@ -974,7 +945,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
llama_memory_state_ptr mstate;
|
||||
|
||||
while (true) {
|
||||
mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
|
||||
mstate = memory->init_batch(batch, cparams.n_ubatch, embd_all);
|
||||
if (!mstate) {
|
||||
return -2;
|
||||
}
|
||||
@@ -1018,7 +989,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
// reserve output buffer
|
||||
if (output_reserve(n_outputs_all) < n_outputs_all) {
|
||||
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
|
||||
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
|
||||
return -2;
|
||||
};
|
||||
|
||||
@@ -1027,7 +998,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
do {
|
||||
const auto & ubatch = mstate->get_ubatch();
|
||||
|
||||
// count the outputs in this u_batch
|
||||
// count the outputs in this ubatch
|
||||
{
|
||||
int32_t n_outputs_new = 0;
|
||||
|
||||
@@ -1052,18 +1023,19 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
if (!res) {
|
||||
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
|
||||
llama_pos pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
llama_pos pos_min[LLAMA_MAX_SEQ];
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
pos_min[s] = std::numeric_limits<llama_pos>::max();
|
||||
}
|
||||
|
||||
// TODO: fix sequence indexing
|
||||
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
|
||||
const auto & seq_id = ubatch.seq_id[i][0];
|
||||
|
||||
pos_min[seq_id] = std::min(pos_min[seq_id], ubatch.pos[i]);
|
||||
}
|
||||
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (pos_min[s] == std::numeric_limits<llama_pos>::max()) {
|
||||
continue;
|
||||
}
|
||||
@@ -1086,7 +1058,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
||||
//}
|
||||
|
||||
auto * t_logits = cparams.embeddings ? nullptr : res->get_logits();
|
||||
auto * t_logits = res->get_logits();
|
||||
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
|
||||
|
||||
if (t_embd && res->get_embd_pooled()) {
|
||||
@@ -1170,14 +1142,14 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
n_outputs = n_outputs_all;
|
||||
|
||||
// set output mappings
|
||||
{
|
||||
if (n_outputs > 0) {
|
||||
bool sorted_output = true;
|
||||
|
||||
auto & out_ids = mstate->out_ids();
|
||||
|
||||
GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all);
|
||||
GGML_ASSERT(out_ids.size() == (size_t) n_outputs);
|
||||
|
||||
for (int64_t i = 0; i < n_outputs_all; ++i) {
|
||||
for (int64_t i = 0; i < n_outputs; ++i) {
|
||||
int64_t out_id = out_ids[i];
|
||||
output_ids[out_id] = i;
|
||||
if (out_id != i) {
|
||||
@@ -1189,20 +1161,22 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// note: this is mostly relevant for recurrent models atm
|
||||
if (!sorted_output) {
|
||||
const uint32_t n_vocab = model.vocab.n_tokens();
|
||||
const uint32_t n_embd = model.hparams.n_embd;
|
||||
const uint64_t n_embd = model.hparams.n_embd;
|
||||
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
|
||||
// TODO: is there something more efficient which also minimizes swaps?
|
||||
// selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
|
||||
for (int32_t i = 0; i < n_outputs - 1; ++i) {
|
||||
int32_t j_min = i;
|
||||
for (int32_t j = i + 1; j < n_outputs; ++j) {
|
||||
for (uint32_t i = 0; i < n_outputs - 1; ++i) {
|
||||
uint32_t j_min = i;
|
||||
for (uint32_t j = i + 1; j < n_outputs; ++j) {
|
||||
if (out_ids[j] < out_ids[j_min]) {
|
||||
j_min = j;
|
||||
}
|
||||
}
|
||||
if (j_min == i) { continue; }
|
||||
if (j_min == i) {
|
||||
continue;
|
||||
}
|
||||
std::swap(out_ids[i], out_ids[j_min]);
|
||||
if (logits_size > 0) {
|
||||
for (uint32_t k = 0; k < n_vocab; k++) {
|
||||
@@ -1215,8 +1189,10 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::fill(output_ids.begin(), output_ids.end(), -1);
|
||||
for (int32_t i = 0; i < n_outputs; ++i) {
|
||||
|
||||
for (uint32_t i = 0; i < n_outputs; ++i) {
|
||||
output_ids[out_ids[i]] = i;
|
||||
}
|
||||
}
|
||||
@@ -1236,7 +1212,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// output
|
||||
//
|
||||
|
||||
int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
const auto & hparams = model.hparams;
|
||||
const auto & vocab = model.vocab;
|
||||
|
||||
@@ -1246,9 +1222,8 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
const auto n_vocab = vocab.n_tokens();
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
// TODO: use a per-batch flag for logits presence instead
|
||||
bool has_logits = !cparams.embeddings;
|
||||
bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
|
||||
bool has_logits = true;
|
||||
bool has_embd = cparams.embeddings;
|
||||
|
||||
// TODO: hacky enc-dec support
|
||||
if (model.arch == LLM_ARCH_T5) {
|
||||
@@ -1302,8 +1277,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
// set all ids as invalid (negative)
|
||||
std::fill(output_ids.begin(), output_ids.end(), -1);
|
||||
|
||||
this->n_outputs = 0;
|
||||
this->n_outputs_max = n_outputs_max;
|
||||
this->n_outputs = 0;
|
||||
|
||||
return n_outputs_max;
|
||||
}
|
||||
@@ -1332,7 +1306,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
|
||||
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
|
||||
if (n_tokens % n_seqs != 0) {
|
||||
n_tokens = (n_tokens / n_seqs) * n_seqs;
|
||||
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
|
||||
n_outputs = std::min(n_outputs, n_tokens);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
@@ -1794,14 +1768,12 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
|
||||
|
||||
std::vector<int32_t> w_output_pos;
|
||||
|
||||
GGML_ASSERT(n_outputs <= n_outputs_max);
|
||||
|
||||
w_output_pos.resize(n_outputs);
|
||||
|
||||
// build a more compact representation of the output ids
|
||||
for (size_t i = 0; i < n_batch(); ++i) {
|
||||
// map an output id to a position in the batch
|
||||
int32_t pos = output_ids[i];
|
||||
int64_t pos = output_ids[i];
|
||||
if (pos >= 0) {
|
||||
GGML_ASSERT(pos < n_outputs);
|
||||
w_output_pos[pos] = i;
|
||||
@@ -2071,14 +2043,11 @@ void llama_context::opt_epoch_iter(
|
||||
|
||||
n_queued_tokens += n_tokens_all;
|
||||
|
||||
// this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
|
||||
const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
|
||||
|
||||
embd_seq.clear();
|
||||
|
||||
int64_t n_outputs_all = n_tokens_all;
|
||||
uint32_t n_outputs_all = n_tokens_all;
|
||||
|
||||
auto mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ true);
|
||||
auto mstate = memory->init_batch(batch, cparams.n_ubatch, true);
|
||||
if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
|
||||
LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__);
|
||||
break;
|
||||
@@ -2086,7 +2055,7 @@ void llama_context::opt_epoch_iter(
|
||||
|
||||
// reserve output buffer
|
||||
if (output_reserve(n_outputs_all) < n_outputs_all) {
|
||||
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
|
||||
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
|
||||
GGML_ABORT("TODO: handle this error");
|
||||
};
|
||||
|
||||
|
||||
+8
-6
@@ -1,7 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-cparams.h"
|
||||
#include "llama-graph.h"
|
||||
#include "llama-adapter.h"
|
||||
@@ -13,6 +12,7 @@
|
||||
#include <vector>
|
||||
|
||||
struct llama_model;
|
||||
class llama_batch_allocr;
|
||||
|
||||
class llama_io_read_i;
|
||||
class llama_io_write_i;
|
||||
@@ -102,8 +102,8 @@ struct llama_context {
|
||||
llama_memory_state_i * mstate,
|
||||
ggml_status & ret);
|
||||
|
||||
int encode(llama_batch & inp_batch);
|
||||
int decode(llama_batch & inp_batch);
|
||||
int encode(const llama_batch & batch_inp);
|
||||
int decode(const llama_batch & batch_inp);
|
||||
|
||||
//
|
||||
// state save/load
|
||||
@@ -181,7 +181,7 @@ private:
|
||||
|
||||
// Make sure enough space is available for outputs.
|
||||
// Returns max number of outputs for which space was reserved.
|
||||
int32_t output_reserve(int32_t n_outputs);
|
||||
uint32_t output_reserve(int32_t n_outputs);
|
||||
|
||||
//
|
||||
// graph
|
||||
@@ -246,8 +246,10 @@ private:
|
||||
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
|
||||
std::map<llama_seq_id, std::vector<float>> embd_seq;
|
||||
|
||||
int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
|
||||
int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers
|
||||
// reuse the batch_allocr to avoid unnecessary memory allocations
|
||||
std::unique_ptr<llama_batch_allocr> batch_allocr;
|
||||
|
||||
uint32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
|
||||
|
||||
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "llama-cparams.h"
|
||||
|
||||
size_t llama_max_parallel_sequences(void) {
|
||||
return LLAMA_MAX_PARALLEL_SEQUENCES;
|
||||
return LLAMA_MAX_SEQ;
|
||||
}
|
||||
|
||||
+1
-1
@@ -4,7 +4,7 @@
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#define LLAMA_MAX_PARALLEL_SEQUENCES 64
|
||||
#define LLAMA_MAX_SEQ 64
|
||||
|
||||
struct llama_cparams {
|
||||
uint32_t n_ctx; // context size used during inference
|
||||
|
||||
+52
-60
@@ -139,6 +139,7 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
|
||||
|
||||
std::vector<uint64_t> sum(n_tokens, 0);
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[s][0];
|
||||
|
||||
@@ -156,6 +157,7 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[s][0];
|
||||
|
||||
@@ -180,6 +182,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
|
||||
uint32_t * data = (uint32_t *) cls->data;
|
||||
memset(cls->data, 0, n_tokens * ggml_element_size(cls));
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[s][0];
|
||||
|
||||
@@ -210,6 +213,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
|
||||
std::vector<int> last_pos(n_tokens, -1);
|
||||
std::vector<int> last_row(n_tokens, -1);
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[s][0];
|
||||
|
||||
@@ -250,22 +254,6 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
|
||||
GGML_UNUSED(ubatch);
|
||||
|
||||
const int64_t n_kv = kv_state->get_n_kv();
|
||||
|
||||
if (s_mask) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer));
|
||||
float * data = (float *) s_mask->data;
|
||||
|
||||
// clear unused states
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
data[i] = kv_state->s_mask(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
|
||||
GGML_UNUSED(ubatch);
|
||||
|
||||
@@ -299,6 +287,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
|
||||
const int32_t ti = s0*n_seq_tokens + i;
|
||||
float f = -INFINITY;
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
|
||||
if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) {
|
||||
if (hparams.use_alibi) {
|
||||
@@ -338,6 +327,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
|
||||
const int32_t ti = s0*n_seq_tokens + i;
|
||||
float f = -INFINITY;
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
|
||||
if (ubatch->seq_id[s0][s] == seq_id) {
|
||||
if (hparams.use_alibi) {
|
||||
@@ -393,6 +383,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
||||
for (int j = 0; j < n_tokens; ++j) {
|
||||
for (int i = 0; i < n_enc; ++i) {
|
||||
float f = -INFINITY;
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < ubatch->n_seq_id[j]; ++s) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[j][s];
|
||||
if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) {
|
||||
@@ -650,6 +641,7 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||
{
|
||||
// Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
|
||||
int64_t split_point = cur->ne[0] / 2;
|
||||
// TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
|
||||
ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
|
||||
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
|
||||
|
||||
@@ -663,7 +655,7 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||
{
|
||||
// Split into two equal parts
|
||||
int64_t split_point = cur->ne[0] / 2;
|
||||
// TODO: these conts should not be needed
|
||||
// TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
|
||||
ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
|
||||
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
|
||||
|
||||
@@ -986,23 +978,6 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
|
||||
return cur;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_s_mask() const {
|
||||
const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_s_mask>(kv_state);
|
||||
|
||||
const auto n_kv = kv_state->get_n_kv();
|
||||
|
||||
auto & cur = inp->s_mask;
|
||||
|
||||
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
|
||||
ggml_set_input(cur);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
|
||||
auto inp = std::make_unique<llm_graph_input_cross_embd>(cross);
|
||||
|
||||
@@ -1455,43 +1430,53 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
return cur;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_copy_mask_state(
|
||||
ggml_tensor * llm_graph_context::build_recurrent_state(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * s,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
int32_t n_state,
|
||||
int32_t n_seqs) const {
|
||||
int32_t state_size,
|
||||
int32_t n_seqs,
|
||||
bool avoid_copies) const {
|
||||
const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
|
||||
|
||||
const auto n_kv = kv_state->get_n_kv();
|
||||
const auto kv_head = kv_state->get_head();
|
||||
const auto rs_zero = kv_state->get_rs_z();
|
||||
|
||||
ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_state->get_size());
|
||||
ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, kv_state->get_size());
|
||||
|
||||
// copy states
|
||||
// NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
|
||||
// this shrinks the tensors's ne[1] to n_kv
|
||||
states = ggml_get_rows(ctx0, states, state_copy);
|
||||
// Clear a single state which will then be copied to the other cleared states.
|
||||
// Note that this is a no-op when the view is zero-sized.
|
||||
ggml_tensor * state_zero = ggml_view_1d(ctx0, states, state_size*(rs_zero >= 0), rs_zero*states->nb[1]*(rs_zero >= 0));
|
||||
ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
|
||||
|
||||
// clear states of sequences which are starting at the beginning of this batch
|
||||
// FIXME: zero-out NANs?
|
||||
states = ggml_mul(ctx0, states, state_mask);
|
||||
ggml_tensor * output_states;
|
||||
|
||||
// copy states which won't be changed further (between n_seqs and n_kv)
|
||||
if (!avoid_copies) {
|
||||
// copy states
|
||||
// NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
|
||||
// {state_size, kv_size} -> {state_size, n_seqs}
|
||||
output_states = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_seqs, 0));
|
||||
ggml_build_forward_expand(gf, output_states);
|
||||
} else {
|
||||
// FIXME: make the gathering operation happen before the copy below
|
||||
// (maybe with an optional lambda function passed as a parameter instead of `avoid_copies`?)
|
||||
output_states = states;
|
||||
}
|
||||
|
||||
// copy extra states which won't be changed further (between n_seqs and n_kv)
|
||||
ggml_tensor * states_extra = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_kv - n_seqs, n_seqs*state_copy->nb[0]));
|
||||
ggml_build_forward_expand(gf,
|
||||
ggml_cpy(ctx0,
|
||||
ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs )*n_state*ggml_element_size(states)),
|
||||
ggml_view_1d(ctx0, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s))));
|
||||
states_extra,
|
||||
ggml_view_1d(ctx0, s, state_size*(n_kv - n_seqs), (kv_head + n_seqs)*state_size*ggml_element_size(s))));
|
||||
|
||||
// the part of the states that will be used and modified
|
||||
return ggml_view_2d(ctx0, states, n_state, n_seqs, states->nb[1], 0);
|
||||
return output_states;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
|
||||
@@ -1502,8 +1487,8 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
|
||||
|
||||
ggml_tensor * token_shift_all = kv_state->get_k_l(il);
|
||||
|
||||
ggml_tensor * token_shift = build_copy_mask_state(
|
||||
gf, token_shift_all, state_copy, state_mask,
|
||||
ggml_tensor * token_shift = build_recurrent_state(
|
||||
gf, token_shift_all, state_copy,
|
||||
hparams.n_embd_k_s(), n_seqs);
|
||||
|
||||
token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);
|
||||
@@ -1578,23 +1563,30 @@ void llm_graph_context::build_pooling(
|
||||
ggml_tensor * inp_cls = build_inp_cls();
|
||||
inp = ggml_get_rows(ctx0, inp, inp_cls);
|
||||
|
||||
if (cls != nullptr && cls_b != nullptr) {
|
||||
if (cls) {
|
||||
// classification head
|
||||
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
|
||||
cur = ggml_mul_mat(ctx0, cls, inp);
|
||||
if (cls_b) {
|
||||
cur = ggml_add(ctx0, cur, cls_b);
|
||||
}
|
||||
cur = ggml_tanh(ctx0, cur);
|
||||
|
||||
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
||||
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
|
||||
if (cls_out) {
|
||||
GGML_ASSERT(cls_out_b != nullptr);
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
|
||||
cur = ggml_mul_mat(ctx0, cls_out, cur);
|
||||
if (cls_out_b) {
|
||||
cur = ggml_add(ctx0, cur, cls_out_b);
|
||||
}
|
||||
}
|
||||
} else if (cls_out) {
|
||||
// Single layer classification head (direct projection)
|
||||
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
|
||||
GGML_ASSERT(cls_out_b != nullptr);
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
|
||||
cur = ggml_mul_mat(ctx0, cls_out, inp);
|
||||
if (cls_out_b) {
|
||||
cur = ggml_add(ctx0, cur, cls_out_b);
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
|
||||
}
|
||||
|
||||
+7
-21
@@ -200,18 +200,6 @@ public:
|
||||
const llama_kv_cache_recurrent_state * kv_state;
|
||||
};
|
||||
|
||||
class llm_graph_input_s_mask : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_s_mask(const llama_kv_cache_recurrent_state * kv_state) : kv_state(kv_state) {}
|
||||
virtual ~llm_graph_input_s_mask() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
ggml_tensor * s_mask; // F32 [1, n_kv]
|
||||
|
||||
const llama_kv_cache_recurrent_state * kv_state;
|
||||
};
|
||||
|
||||
class llm_graph_input_cross_embd : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_cross_embd(
|
||||
@@ -390,7 +378,7 @@ struct llm_graph_params {
|
||||
const llama_memory_state_i * mstate;
|
||||
const llama_cross * cross;
|
||||
|
||||
int32_t n_outputs;
|
||||
uint32_t n_outputs;
|
||||
|
||||
const llm_graph_cb & cb;
|
||||
};
|
||||
@@ -424,8 +412,8 @@ struct llm_graph_context {
|
||||
const float norm_eps;
|
||||
const float norm_rms_eps;
|
||||
|
||||
const int32_t n_tokens;
|
||||
const int32_t n_outputs;
|
||||
const int64_t n_tokens;
|
||||
const int64_t n_outputs;
|
||||
const int32_t n_ctx_orig; // yarn
|
||||
|
||||
const enum llama_pooling_type pooling_type;
|
||||
@@ -521,7 +509,6 @@ struct llm_graph_context {
|
||||
ggml_tensor * build_inp_mean() const;
|
||||
ggml_tensor * build_inp_cls() const;
|
||||
ggml_tensor * build_inp_s_copy() const;
|
||||
ggml_tensor * build_inp_s_mask() const;
|
||||
|
||||
ggml_tensor * build_inp_cross_embd() const;
|
||||
ggml_tensor * build_inp_pos_bucket_enc() const;
|
||||
@@ -606,18 +593,17 @@ struct llm_graph_context {
|
||||
// recurrent
|
||||
//
|
||||
|
||||
ggml_tensor * build_copy_mask_state(
|
||||
ggml_tensor * build_recurrent_state(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * s,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
int32_t n_state,
|
||||
int32_t n_seqs) const;
|
||||
int32_t state_size,
|
||||
int32_t n_seqs,
|
||||
bool avoid_copies = false) const;
|
||||
|
||||
ggml_tensor * build_rwkv_token_shift_load(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const;
|
||||
|
||||
|
||||
@@ -359,18 +359,16 @@ llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
|
||||
return result;
|
||||
}
|
||||
|
||||
llama_memory_state_ptr llama_kv_cache_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
|
||||
GGML_UNUSED(embd_pooled);
|
||||
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, false, logits_all);
|
||||
llama_memory_state_ptr llama_kv_cache_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_all) {
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, false);
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
|
||||
while (sbatch.n_tokens > 0) {
|
||||
llama_ubatch ubatch;
|
||||
|
||||
if (embd_pooled) {
|
||||
// Pooled embeddings cannot be split across ubatches (yet)
|
||||
if (embd_all) {
|
||||
// if all tokens are output, split by sequence
|
||||
ubatch = sbatch.split_seq(n_ubatch);
|
||||
} else {
|
||||
ubatch = sbatch.split_equal(n_ubatch);
|
||||
@@ -406,21 +404,12 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche
|
||||
|
||||
bool success = true;
|
||||
|
||||
// TODO: here we have to verify that all ubatches can fit in the cells
|
||||
// however, the current implementation is broken because it relies on s_copy() and s_mask() to update the cells
|
||||
// during the compute of each ubatch. to reproduce, uncomment the following loop and run:
|
||||
//
|
||||
// $ llama-parallel -m ./mamba-130m/ggml-model-f16.gguf -np 5 -ns 8
|
||||
//
|
||||
// recovery from failures when the batch does not fit in the KV cache will not work correctly until this is fixed
|
||||
//
|
||||
GGML_UNUSED(ubatches);
|
||||
//for (const auto & ubatch : ubatches) {
|
||||
// if (!find_slot(ubatch)) {
|
||||
// success = false;
|
||||
// break;
|
||||
// }
|
||||
//}
|
||||
for (const auto & ubatch : ubatches) {
|
||||
if (!find_slot(ubatch)) {
|
||||
success = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// restore the original state
|
||||
cells = std::move(org_cells);
|
||||
@@ -431,14 +420,13 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche
|
||||
}
|
||||
|
||||
bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
const uint32_t n_tokens = ubatch.n_tokens;
|
||||
const uint32_t n_seqs = ubatch.n_seqs;
|
||||
const uint32_t n_seqs = ubatch.n_seqs;
|
||||
|
||||
const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
|
||||
// if we have enough unused cells before the current head ->
|
||||
// better to start searching from the beginning of the cache, hoping to fill it
|
||||
if (head > used + 2*n_tokens) {
|
||||
if (head > used + 2*n_seqs) {
|
||||
head = 0;
|
||||
}
|
||||
|
||||
@@ -534,16 +522,16 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
empty_cell.src = orig_cell.src;
|
||||
orig_cell.seq_id.erase(seq_id);
|
||||
empty_cell.seq_id.insert(seq_id); // will be overwritten
|
||||
GGML_ASSERT(!orig_cell.is_empty()); // has at least one remaining seq_id
|
||||
}
|
||||
seq_meta.tail = next_empty_cell;
|
||||
// find next empty cell
|
||||
if (s + 1 < n_seqs) {
|
||||
next_empty_cell += 1;
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
next_empty_cell += 1;
|
||||
if (next_empty_cell >= size) { next_empty_cell -= size; }
|
||||
kv_cell & cell = cells[next_empty_cell];
|
||||
if (cell.is_empty()) { break; }
|
||||
next_empty_cell += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -553,8 +541,8 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
|
||||
// gather and re-order
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
int32_t dst_id = s + min;
|
||||
int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
|
||||
const int32_t dst_id = s + min;
|
||||
const int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
|
||||
if (dst_id != src_id) {
|
||||
kv_cell & dst_cell = cells[dst_id];
|
||||
kv_cell & src_cell = cells[src_id];
|
||||
@@ -563,12 +551,14 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
std::swap(dst_cell.src, src_cell.src);
|
||||
std::swap(dst_cell.seq_id, src_cell.seq_id);
|
||||
|
||||
// swap tails (assuming they NEVER overlap)
|
||||
for (const llama_seq_id seq_id : src_cell.seq_id) {
|
||||
cells[seq_id].tail = src_id;
|
||||
}
|
||||
for (const llama_seq_id seq_id : dst_cell.seq_id) {
|
||||
cells[seq_id].tail = dst_id;
|
||||
// swap tails
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
int32_t & tail = cells[i].tail;
|
||||
if (tail == src_id) {
|
||||
tail = dst_id;
|
||||
} else if (tail == dst_id) {
|
||||
tail = src_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -576,7 +566,7 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
// update the pos of the used seqs
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
|
||||
int32_t cell_id = s + min;
|
||||
const int32_t cell_id = s + min;
|
||||
kv_cell & cell = cells[cell_id];
|
||||
|
||||
if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
|
||||
@@ -594,6 +584,38 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
// Find first cell without src refs, to use as the zero-ed state
|
||||
{
|
||||
// TODO: bake-in src refcounts in the cell metadata
|
||||
std::vector<int32_t> refcounts(size, 0);
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
const int32_t src = cells[i].src;
|
||||
if (src >= 0) {
|
||||
refcounts[src] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
rs_z = -1;
|
||||
for (int i = min; i <= max; ++i) {
|
||||
if (refcounts[i] == 0) {
|
||||
rs_z = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = min; i <= max; ++i) {
|
||||
if (cells[i].src < 0) {
|
||||
GGML_ASSERT(rs_z >= 0);
|
||||
cells[i].src0 = rs_z;
|
||||
} else {
|
||||
// Stage the source ids for all used cells to allow correct seq_* behavior
|
||||
// and still make these values available when setting the inputs
|
||||
cells[i].src0 = cells[i].src;
|
||||
}
|
||||
cells[i].src = i; // avoid moving or clearing twice
|
||||
}
|
||||
}
|
||||
|
||||
// allow getting the range of used cells, from head to head + n
|
||||
head = min;
|
||||
n = max - min + 1;
|
||||
@@ -605,47 +627,8 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
}
|
||||
|
||||
bool llama_kv_cache_recurrent::get_can_shift() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
int32_t llama_kv_cache_recurrent::s_copy(int i) const {
|
||||
const uint32_t cell_id = i + head;
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// TODO: this should not mutate the KV cache !
|
||||
kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
|
||||
|
||||
// prevent out-of-bound sources
|
||||
if (cell.src < 0 || (uint32_t) cell.src >= size) {
|
||||
cell.src = cell_id;
|
||||
}
|
||||
|
||||
int32_t res = cell.src;
|
||||
|
||||
// TODO: do not mutate the KV cache
|
||||
// ensure copy only happens once
|
||||
if (cell.src != (int32_t) cell_id) {
|
||||
cell.src = cell_id;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
float llama_kv_cache_recurrent::s_mask(int i) const {
|
||||
const uint32_t cell_id = i + head;
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// TODO: this should not mutate the KV cache !
|
||||
kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
|
||||
|
||||
float res = (float) (cell.src >= 0);
|
||||
|
||||
// only clear once
|
||||
if (cell.src < 0) {
|
||||
cell.src = cell_id;
|
||||
}
|
||||
|
||||
return res;
|
||||
// shifting the pos is trivial for recurrent models
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t llama_kv_cache_recurrent::total_size() const {
|
||||
@@ -1111,6 +1094,10 @@ uint32_t llama_kv_cache_recurrent_state::get_head() const {
|
||||
return is_full ? 0 : kv->head;
|
||||
}
|
||||
|
||||
int32_t llama_kv_cache_recurrent_state::get_rs_z() const {
|
||||
return is_full ? 0 : kv->rs_z;
|
||||
}
|
||||
|
||||
uint32_t llama_kv_cache_recurrent_state::get_size() const {
|
||||
return kv->size;
|
||||
}
|
||||
@@ -1124,9 +1111,5 @@ ggml_tensor * llama_kv_cache_recurrent_state::get_v_l(int32_t il) const {
|
||||
}
|
||||
|
||||
int32_t llama_kv_cache_recurrent_state::s_copy(int i) const {
|
||||
return kv->s_copy(i);
|
||||
}
|
||||
|
||||
float llama_kv_cache_recurrent_state::s_mask(int i) const {
|
||||
return kv->s_mask(i);
|
||||
return kv->cells[i + kv->head].src0;
|
||||
}
|
||||
|
||||
@@ -32,8 +32,7 @@ public:
|
||||
llama_memory_state_ptr init_batch(
|
||||
const llama_batch & batch,
|
||||
uint32_t n_ubatch,
|
||||
bool embd_pooled,
|
||||
bool logits_all) override;
|
||||
bool embd_all) override;
|
||||
|
||||
llama_memory_state_ptr init_full() override;
|
||||
|
||||
@@ -57,10 +56,6 @@ public:
|
||||
|
||||
bool get_can_shift() const override;
|
||||
|
||||
// TODO: temporary methods - they are not really const as they do const_cast<>, fix this
|
||||
int32_t s_copy(int i) const;
|
||||
float s_mask(int i) const;
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
|
||||
@@ -73,10 +68,14 @@ public:
|
||||
// computed before each graph build
|
||||
uint32_t n = 0;
|
||||
|
||||
// first zero-ed state
|
||||
int32_t rs_z = -1;
|
||||
|
||||
// TODO: optimize for recurrent state needs
|
||||
struct kv_cell {
|
||||
llama_pos pos = -1;
|
||||
int32_t src = -1; // used to copy states
|
||||
int32_t src = -1; // used to know where states should be copied from
|
||||
int32_t src0 = -1; // like src, but only used when setting the inputs (allowing to copy once)
|
||||
int32_t tail = -1;
|
||||
|
||||
std::set<llama_seq_id> seq_id;
|
||||
@@ -157,13 +156,13 @@ public:
|
||||
|
||||
uint32_t get_n_kv() const;
|
||||
uint32_t get_head() const;
|
||||
int32_t get_rs_z() const;
|
||||
uint32_t get_size() const;
|
||||
|
||||
ggml_tensor * get_k_l(int32_t il) const;
|
||||
ggml_tensor * get_v_l(int32_t il) const;
|
||||
|
||||
int32_t s_copy(int i) const;
|
||||
float s_mask(int i) const;
|
||||
|
||||
private:
|
||||
const llama_memory_status status;
|
||||
|
||||
@@ -95,36 +95,69 @@ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
|
||||
return kv_swa->seq_pos_max(seq_id);
|
||||
}
|
||||
|
||||
llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
|
||||
GGML_UNUSED(embd_pooled);
|
||||
llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_all) {
|
||||
GGML_UNUSED(embd_all);
|
||||
|
||||
// TODO: if we fail with split_simple, we should attempt different splitting strategies
|
||||
// first try simple split
|
||||
do {
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, true);
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
|
||||
while (sbatch.n_tokens > 0) {
|
||||
auto ubatch = sbatch.split_simple(n_ubatch);
|
||||
|
||||
ubatches.push_back(ubatch);
|
||||
}
|
||||
|
||||
auto heads_base = kv_base->prepare(ubatches);
|
||||
if (heads_base.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
auto heads_swa = kv_swa->prepare(ubatches);
|
||||
if (heads_swa.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
assert(heads_base.size() == heads_swa.size());
|
||||
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(
|
||||
this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
|
||||
} while (false);
|
||||
|
||||
// if it fails, try equal split
|
||||
do {
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, false);
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
|
||||
while (sbatch.n_tokens > 0) {
|
||||
auto ubatch = sbatch.split_equal(n_ubatch);
|
||||
|
||||
ubatches.push_back(ubatch);
|
||||
}
|
||||
|
||||
auto heads_base = kv_base->prepare(ubatches);
|
||||
if (heads_base.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
auto heads_swa = kv_swa->prepare(ubatches);
|
||||
if (heads_swa.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
assert(heads_base.size() == heads_swa.size());
|
||||
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(
|
||||
this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
|
||||
} while (false);
|
||||
|
||||
// TODO: if we fail again, we should attempt different splitting strategies
|
||||
// but to do that properly, we first have to refactor the batches to be more flexible
|
||||
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all);
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
|
||||
while (sbatch.n_tokens > 0) {
|
||||
auto ubatch = sbatch.split_simple(n_ubatch);
|
||||
|
||||
ubatches.push_back(ubatch);
|
||||
}
|
||||
|
||||
auto heads_base = kv_base->prepare(ubatches);
|
||||
if (heads_base.empty()) {
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
||||
}
|
||||
|
||||
auto heads_swa = kv_swa->prepare(ubatches);
|
||||
if (heads_swa.empty()) {
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
||||
}
|
||||
|
||||
assert(heads_base.size() == heads_swa.size());
|
||||
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(
|
||||
this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
||||
}
|
||||
|
||||
llama_memory_state_ptr llama_kv_cache_unified_iswa::init_full() {
|
||||
|
||||
@@ -34,8 +34,7 @@ public:
|
||||
llama_memory_state_ptr init_batch(
|
||||
const llama_batch & batch,
|
||||
uint32_t n_ubatch,
|
||||
bool embd_pooled,
|
||||
bool logits_all) override;
|
||||
bool embd_all) override;
|
||||
|
||||
llama_memory_state_ptr init_full() override;
|
||||
|
||||
|
||||
+148
-85
@@ -127,6 +127,9 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
||||
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
|
||||
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
||||
}
|
||||
|
||||
const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
|
||||
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::clear(bool data) {
|
||||
@@ -307,24 +310,27 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
|
||||
llama_memory_state_ptr llama_kv_cache_unified::init_batch(
|
||||
const llama_batch & batch,
|
||||
uint32_t n_ubatch,
|
||||
bool embd_pooled,
|
||||
bool logits_all) {
|
||||
GGML_UNUSED(embd_pooled);
|
||||
bool embd_all) {
|
||||
GGML_UNUSED(embd_all);
|
||||
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all);
|
||||
do {
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, true);
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
while (sbatch.n_tokens > 0) {
|
||||
ubatches.push_back(sbatch.split_simple(n_ubatch));
|
||||
}
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
while (sbatch.n_tokens > 0) {
|
||||
ubatches.push_back(sbatch.split_simple(n_ubatch));
|
||||
}
|
||||
|
||||
auto heads = prepare(ubatches);
|
||||
if (heads.empty()) {
|
||||
return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
||||
}
|
||||
auto heads = prepare(ubatches);
|
||||
if (heads.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
return std::make_unique<llama_kv_cache_unified_state>(
|
||||
this, std::move(sbatch), std::move(heads), std::move(ubatches));
|
||||
return std::make_unique<llama_kv_cache_unified_state>(
|
||||
this, std::move(sbatch), std::move(heads), std::move(ubatches));
|
||||
} while (false);
|
||||
|
||||
return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
||||
}
|
||||
|
||||
llama_memory_state_ptr llama_kv_cache_unified::init_full() {
|
||||
@@ -462,7 +468,7 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
|
||||
for (uint32_t i = 0; i < n_kv; ++i) {
|
||||
assert(dinfo.ids[i] <= n_kv);
|
||||
|
||||
if (dinfo.ids[i] == n_kv) {
|
||||
if (dinfo.ids[i] == n_kv || dinfo.ids[i] == i) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -512,43 +518,68 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||
head_cur = 0;
|
||||
}
|
||||
|
||||
// otherwise, one cell per token.
|
||||
|
||||
if (n_tokens > cells.size()) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
|
||||
return -1;
|
||||
}
|
||||
|
||||
//#define FIND_SLOT_DEBUG 1
|
||||
#if FIND_SLOT_DEBUG
|
||||
LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", cells.used_max_p1(), cells.get_used(), head, n_swa);
|
||||
if (debug > 0) {
|
||||
LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa);
|
||||
|
||||
// for debugging
|
||||
{
|
||||
std::string ss;
|
||||
if (n_swa > 0) {
|
||||
if ((debug == 2 && n_swa > 0) || debug > 2) {
|
||||
std::string ss;
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (cells.is_empty(i)) {
|
||||
ss += '.';
|
||||
} else {
|
||||
ss += std::to_string(cells.seq_get(i));
|
||||
assert(cells.seq_count(i) >= 1);
|
||||
|
||||
if (cells.seq_count(i) == 1) {
|
||||
ss += std::to_string(cells.seq_get(i));
|
||||
} else {
|
||||
ss += 'M';
|
||||
}
|
||||
}
|
||||
if (i%256 == 255) {
|
||||
ss += " *";
|
||||
ss += '\n';
|
||||
}
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_WARN("\n%s\n", ss.c_str());
|
||||
}
|
||||
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
if (cells.seq_pos_min(s) < 0) {
|
||||
continue;
|
||||
LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
|
||||
}
|
||||
|
||||
LLAMA_LOG_WARN("kv_cells: n_swa = %4d, min[%d] = %5d, max[%d] = %5d\n", n_swa, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
|
||||
if ((debug == 2 && n_swa > 0) || debug > 2) {
|
||||
std::string ss;
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
std::string cur;
|
||||
if (cells.is_empty(i)) {
|
||||
cur = '.';
|
||||
} else {
|
||||
cur = std::to_string(cells.pos_get(i));
|
||||
}
|
||||
const int n = cur.size();
|
||||
for (int j = 0; j < 5 - n; ++j) {
|
||||
cur += ' ';
|
||||
}
|
||||
ss += cur;
|
||||
if (i%256 == 255) {
|
||||
ss += " *";
|
||||
}
|
||||
if (i%64 == 63) {
|
||||
ss += '\n';
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
|
||||
}
|
||||
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (cells.seq_pos_min(s) < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
uint32_t n_tested = 0;
|
||||
|
||||
@@ -559,21 +590,15 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||
continue;
|
||||
}
|
||||
|
||||
// keep track of what the minimum sequence positions would be if we accept the ubatch
|
||||
llama_seq_id seq_pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
seq_pos_min[s] = cells.seq_pos_min(s);
|
||||
}
|
||||
|
||||
bool found = true;
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
const llama_pos pos = ubatch.pos[i];
|
||||
const llama_seq_id seq_id = ubatch.seq_id[i][0];
|
||||
//const llama_pos pos = ubatch.pos[i];
|
||||
//const llama_seq_id seq_id = ubatch.seq_id[i][0];
|
||||
|
||||
// can we use this cell? either:
|
||||
// - the cell is empty
|
||||
// - the cell is occupied only by one sequence:
|
||||
// - mask causally, if the sequence is the same as the one we are inserting
|
||||
// - (disabled) mask causally, if the sequence is the same as the one we are inserting
|
||||
// - mask SWA, using current max pos for that sequence in the cache
|
||||
// always insert in the cell with minimum pos
|
||||
bool can_use = cells.is_empty(head_cur + i);
|
||||
@@ -581,21 +606,17 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||
if (!can_use && cells.seq_count(head_cur + i) == 1) {
|
||||
const llama_pos pos_cell = cells.pos_get(head_cur + i);
|
||||
|
||||
// causal mask
|
||||
if (cells.seq_has(head_cur + i, seq_id)) {
|
||||
can_use = pos_cell >= pos;
|
||||
}
|
||||
// (disabled) causal mask
|
||||
// note: it's better to purge any "future" tokens beforehand
|
||||
//if (cells.seq_has(head_cur + i, seq_id)) {
|
||||
// can_use = pos_cell >= pos;
|
||||
//}
|
||||
|
||||
if (!can_use) {
|
||||
const llama_seq_id seq_id_cell = cells.seq_get(head_cur + i);
|
||||
|
||||
// SWA mask
|
||||
// note: we insert only in the cell with minimum pos in order to preserve the invariant that
|
||||
// all positions between [pos_min, pos_max] for each sequence will be present in the cache
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
|
||||
if (pos_cell == seq_pos_min[seq_id_cell] &&
|
||||
is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
||||
seq_pos_min[seq_id_cell]++;
|
||||
if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
||||
can_use = true;
|
||||
}
|
||||
}
|
||||
@@ -623,18 +644,58 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) {
|
||||
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
|
||||
if (!cells.is_empty(head_cur + i)) {
|
||||
cells.rm(head_cur + i);
|
||||
}
|
||||
if (debug > 0) {
|
||||
LLAMA_LOG_DEBUG("%s: ubatch info:\n", __func__);
|
||||
LLAMA_LOG_DEBUG("%s: n_tokens = %d, equal_seqs = %d\n", __func__, ubatch.n_tokens, ubatch.equal_seqs);
|
||||
LLAMA_LOG_DEBUG("%s: n_seq_tokens = %d, n_seqs = %d\n", __func__, ubatch.n_seq_tokens, ubatch.n_seqs);
|
||||
}
|
||||
|
||||
cells.pos_set(head_cur + i, ubatch.pos[i]);
|
||||
// keep track of the max sequence position that we would overwrite with this ubatch
|
||||
// for non-SWA cache, this would be always empty
|
||||
llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
seq_pos_max_rm[s] = -1;
|
||||
}
|
||||
|
||||
for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) {
|
||||
cells.seq_add(head_cur + i, ubatch.seq_id[i][j]);
|
||||
for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
|
||||
for (uint32_t j = 0; j < ubatch.n_seq_tokens; ++j) {
|
||||
const uint32_t idx = s*ubatch.n_seq_tokens + j;
|
||||
|
||||
if (!cells.is_empty(head_cur + idx)) {
|
||||
assert(cells.seq_count(head_cur + idx) == 1);
|
||||
|
||||
const llama_seq_id seq_id = cells.seq_get(head_cur + idx);
|
||||
const llama_pos pos = cells.pos_get(head_cur + idx);
|
||||
|
||||
seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
|
||||
|
||||
cells.rm(head_cur + idx);
|
||||
}
|
||||
|
||||
cells.pos_set(head_cur + idx, ubatch.pos[idx]);
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int32_t i = 0; i < ubatch.n_seq_id[s]; i++) {
|
||||
cells.seq_add(head_cur + idx, ubatch.seq_id[s][i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence
|
||||
// will be present in the cache. so we have to purge any position which is less than those we would overwrite
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (seq_pos_max_rm[s] == -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (cells.seq_pos_min(s) <= seq_pos_max_rm[s]) {
|
||||
LLAMA_LOG_DEBUG("%s: purging positions [%d, %d] of sequence %d from KV cache\n",
|
||||
__func__, cells.seq_pos_min(s), seq_pos_max_rm[s], s);
|
||||
|
||||
seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1);
|
||||
}
|
||||
}
|
||||
// move the head at the end of the slot
|
||||
head = head_cur + ubatch.n_tokens;
|
||||
}
|
||||
@@ -731,14 +792,14 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
|
||||
const int64_t n_seqs = ubatch->n_seqs;
|
||||
const uint32_t n_tokens = ubatch->n_tokens;
|
||||
const uint32_t n_seq_tokens = ubatch->n_seq_tokens;
|
||||
const uint32_t n_seqs = ubatch->n_seqs;
|
||||
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
|
||||
float * data = (float *) dst->data;
|
||||
|
||||
const auto n_kv = dst->ne[0];
|
||||
const int64_t n_kv = dst->ne[0];
|
||||
|
||||
// Use only the previous KV cells of the correct sequence for each token of the ubatch.
|
||||
// It's assumed that if a token in the batch has multiple sequences, they are equivalent.
|
||||
@@ -752,12 +813,14 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
|
||||
// xxxxx-----
|
||||
// xxxxx-----
|
||||
// To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
|
||||
for (int h = 0; h < 1; ++h) {
|
||||
for (int s = 0; s < n_seqs; ++s) {
|
||||
for (uint32_t h = 0; h < 1; ++h) {
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[s][0];
|
||||
|
||||
for (int j = 0; j < n_seq_tokens; ++j) {
|
||||
const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j];
|
||||
for (uint32_t j = 0; j < n_seq_tokens; ++j) {
|
||||
const uint32_t idx = s*n_seq_tokens + j;
|
||||
|
||||
const llama_pos p1 = ubatch->pos[idx];
|
||||
|
||||
for (uint32_t i = 0; i < n_kv; ++i) {
|
||||
float f = 0.0f;
|
||||
@@ -787,16 +850,16 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
|
||||
f = -INFINITY;
|
||||
}
|
||||
|
||||
data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
|
||||
data[h*(n_kv*n_tokens) + idx*n_kv + i] = f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// mask padded tokens
|
||||
if (data) {
|
||||
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
||||
for (uint32_t j = 0; j < n_kv; ++j) {
|
||||
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
|
||||
for (uint32_t j = n_tokens; j < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++j) {
|
||||
for (uint32_t i = 0; i < n_kv; ++i) {
|
||||
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -944,11 +1007,9 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
|
||||
const auto & n_embd_head_k = hparams.n_embd_head_k;
|
||||
//const auto & n_embd_head_v = hparams.n_embd_head_v;
|
||||
|
||||
//GGML_ASSERT(kv_self->size == n_ctx);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_k_shift>(this);
|
||||
|
||||
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx);
|
||||
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size());
|
||||
ggml_set_input(inp->k_shift);
|
||||
|
||||
for (const auto & layer : layers) {
|
||||
@@ -1449,9 +1510,11 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
||||
seq_rm(dest_seq_id, -1, -1);
|
||||
|
||||
llama_sbatch sbatch;
|
||||
llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
|
||||
llama_ubatch ubatch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
|
||||
|
||||
batch.n_tokens = cell_count;
|
||||
ubatch.n_tokens = cell_count;
|
||||
ubatch.n_seq_tokens = cell_count;
|
||||
ubatch.n_seqs = 1;
|
||||
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
llama_pos pos;
|
||||
@@ -1471,18 +1534,18 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
||||
io.read_to(&seq_id, sizeof(seq_id));
|
||||
}
|
||||
|
||||
batch.pos[i] = pos;
|
||||
batch.n_seq_id[i] = n_seq_id;
|
||||
batch.seq_id[i] = &dest_seq_id;
|
||||
ubatch.pos[i] = pos;
|
||||
ubatch.n_seq_id[i] = n_seq_id;
|
||||
ubatch.seq_id[i] = &dest_seq_id;
|
||||
}
|
||||
|
||||
const auto head_cur = find_slot(batch);
|
||||
const auto head_cur = find_slot(ubatch);
|
||||
if (head_cur < 0) {
|
||||
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
apply_ubatch(head_cur, batch);
|
||||
apply_ubatch(head_cur, ubatch);
|
||||
|
||||
// keep the head at the old position because we will read the KV data into it in state_read_data()
|
||||
head = head_cur;
|
||||
@@ -1490,8 +1553,8 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
||||
// DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values)
|
||||
// Assume that this is one contiguous block of cells
|
||||
GGML_ASSERT(head_cur + cell_count <= cells.size());
|
||||
GGML_ASSERT(cells.pos_get(head_cur) == batch.pos[0]);
|
||||
GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == batch.pos[cell_count - 1]);
|
||||
GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]);
|
||||
GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]);
|
||||
GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id));
|
||||
GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
|
||||
} else {
|
||||
@@ -1676,7 +1739,7 @@ llama_kv_cache_unified_state::llama_kv_cache_unified_state(
|
||||
llama_context * lctx,
|
||||
bool do_shift,
|
||||
defrag_info dinfo) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)) {
|
||||
if (!do_shift && dinfo.empty()) {
|
||||
if (!do_shift && this->dinfo.empty()) {
|
||||
status = LLAMA_MEMORY_STATUS_NO_UPDATE;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,8 +59,7 @@ public:
|
||||
llama_memory_state_ptr init_batch(
|
||||
const llama_batch & batch,
|
||||
uint32_t n_ubatch,
|
||||
bool embd_pooled,
|
||||
bool logits_all) override;
|
||||
bool embd_all) override;
|
||||
|
||||
llama_memory_state_ptr init_full() override;
|
||||
|
||||
@@ -158,6 +157,8 @@ private:
|
||||
// SWA
|
||||
const uint32_t n_swa = 0;
|
||||
|
||||
int debug = 0;
|
||||
|
||||
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
|
||||
+18
-13
@@ -23,7 +23,7 @@ public:
|
||||
|
||||
used.clear();
|
||||
|
||||
for (uint32_t s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
seq_pos[s].clear();
|
||||
}
|
||||
}
|
||||
@@ -80,6 +80,9 @@ public:
|
||||
assert(isrc < pos.size());
|
||||
assert(idst < pos.size());
|
||||
|
||||
assert(pos[idst] == -1);
|
||||
assert(pos[isrc] != -1);
|
||||
|
||||
pos [idst] = pos [isrc];
|
||||
shift[idst] = shift[isrc];
|
||||
seq [idst] = seq [isrc];
|
||||
@@ -144,9 +147,10 @@ public:
|
||||
assert(pos[i] != -1);
|
||||
|
||||
seq_pos_rm(i);
|
||||
seq[i].reset();
|
||||
|
||||
pos[i] = -1;
|
||||
seq[i].reset();
|
||||
shift[i] = 0;
|
||||
|
||||
used.erase(i);
|
||||
}
|
||||
@@ -164,6 +168,7 @@ public:
|
||||
|
||||
if (seq[i].none()) {
|
||||
pos[i] = -1;
|
||||
shift[i] = 0;
|
||||
|
||||
used.erase(i);
|
||||
|
||||
@@ -192,6 +197,7 @@ public:
|
||||
seq[i].reset();
|
||||
|
||||
pos[i] = -1;
|
||||
shift[i] = 0;
|
||||
|
||||
used.erase(i);
|
||||
|
||||
@@ -234,7 +240,7 @@ public:
|
||||
llama_seq_id seq_get(uint32_t i) const {
|
||||
assert(seq[i].count() == 1);
|
||||
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (seq[i].test(s)) {
|
||||
return s;
|
||||
}
|
||||
@@ -247,7 +253,7 @@ public:
|
||||
// return -1 if the sequence is not present
|
||||
llama_pos seq_pos_min(llama_seq_id seq_id) const {
|
||||
assert(seq_id >= 0);
|
||||
assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES);
|
||||
assert(seq_id < LLAMA_MAX_SEQ);
|
||||
|
||||
if (seq_pos[seq_id].empty()) {
|
||||
return -1;
|
||||
@@ -260,7 +266,7 @@ public:
|
||||
// return -1 if the sequence is not present
|
||||
llama_pos seq_pos_max(llama_seq_id seq_id) const {
|
||||
assert(seq_id >= 0);
|
||||
assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES);
|
||||
assert(seq_id < LLAMA_MAX_SEQ);
|
||||
|
||||
if (seq_pos[seq_id].empty()) {
|
||||
return -1;
|
||||
@@ -317,21 +323,20 @@ public:
|
||||
pos[i] += d;
|
||||
shift[i] += d;
|
||||
|
||||
seq_pos_add(i);
|
||||
|
||||
has_shift = true;
|
||||
|
||||
if (pos[i] < 0) {
|
||||
seq_pos_rm(i);
|
||||
|
||||
seq[i].reset();
|
||||
pos[i] = -1;
|
||||
shift[i] = 0;
|
||||
|
||||
used.erase(i);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
seq_pos_add(i);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -379,20 +384,20 @@ private:
|
||||
//
|
||||
std::vector<llama_pos> shift;
|
||||
|
||||
using bits_t = std::bitset<LLAMA_MAX_PARALLEL_SEQUENCES>;
|
||||
using bits_t = std::bitset<LLAMA_MAX_SEQ>;
|
||||
|
||||
// the bitset seq[i] tells us which sequences are currently occupying the i-th cell
|
||||
std::vector<bits_t> seq;
|
||||
|
||||
// the set seq_pos[s] tells us which positions are currently present for sequence s
|
||||
// this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
|
||||
std::set<llama_pos> seq_pos[LLAMA_MAX_PARALLEL_SEQUENCES];
|
||||
std::set<llama_pos> seq_pos[LLAMA_MAX_SEQ];
|
||||
|
||||
// helper functions for updating `seq_pos`, once cell at a time:
|
||||
|
||||
// remove cell i
|
||||
void seq_pos_rm(uint32_t i) {
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (seq[i].test(s)) {
|
||||
seq_pos[s].erase(pos[i]);
|
||||
}
|
||||
@@ -401,7 +406,7 @@ private:
|
||||
|
||||
// add cell i
|
||||
void seq_pos_add(uint32_t i) {
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (seq[i].test(s)) {
|
||||
seq_pos[s].insert(pos[i]);
|
||||
}
|
||||
|
||||
+1
-2
@@ -73,8 +73,7 @@ struct llama_memory_i {
|
||||
virtual llama_memory_state_ptr init_batch(
|
||||
const llama_batch & batch,
|
||||
uint32_t n_ubatch,
|
||||
bool embd_pooled,
|
||||
bool logits_all) = 0;
|
||||
bool embd_all) = 0;
|
||||
|
||||
// simulate full cache, used for allocating worst-case compute buffers
|
||||
virtual llama_memory_state_ptr init_full() = 0;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user