mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-30 01:27:42 +02:00
Compare commits
22 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| e6e934c5ea | |||
| b536eb0233 | |||
| e0c93af2a0 | |||
| 423bee462b | |||
| 8abcc70a74 | |||
| eaba92c3dc | |||
| 6ab881b7c3 | |||
| d838c22bb3 | |||
| 25f40ca65f | |||
| 015deb9048 | |||
| 2ceda3f662 | |||
| 44008ce8f9 | |||
| 6a9bf2f788 | |||
| faa1bc26ee | |||
| 32b17abdb0 | |||
| 8bece2eb20 | |||
| a6fd8ca1fe | |||
| c55bce4159 | |||
| 1f1e57f2bf | |||
| e9a859db3c | |||
| 41e3f02647 | |||
| 1efb5f7ae1 |
@@ -293,6 +293,7 @@ jobs:
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
@@ -303,6 +304,7 @@ jobs:
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=OFF
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
@@ -36,7 +36,7 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
|
||||
sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
|
||||
build_type: [RelWithDebInfo]
|
||||
include:
|
||||
- build_type: Release
|
||||
@@ -45,7 +45,7 @@ jobs:
|
||||
- build_type: Release
|
||||
sanitizer: ""
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Dependencies
|
||||
@@ -72,7 +72,15 @@ jobs:
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake -B build \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_SCHED_NO_REALLOC=ON \
|
||||
-DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
|
||||
-DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
|
||||
-DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
|
||||
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
|
||||
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
|
||||
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
|
||||
cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
@@ -88,7 +96,7 @@ jobs:
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) && matrix.build_type == 'Release' }}
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
export ${{ matrix.extra_args }}
|
||||
|
||||
@@ -164,29 +164,6 @@ llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
|
||||
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
|
||||
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
|
||||
|
||||
if (NOT MSVC)
|
||||
if (LLAMA_SANITIZE_THREAD)
|
||||
message(STATUS "Using -fsanitize=thread")
|
||||
|
||||
add_compile_options(-fsanitize=thread)
|
||||
link_libraries (-fsanitize=thread)
|
||||
endif()
|
||||
|
||||
if (LLAMA_SANITIZE_ADDRESS)
|
||||
message(STATUS "Using -fsanitize=address")
|
||||
|
||||
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
|
||||
link_libraries (-fsanitize=address)
|
||||
endif()
|
||||
|
||||
if (LLAMA_SANITIZE_UNDEFINED)
|
||||
message(STATUS "Using -fsanitize=undefined")
|
||||
|
||||
add_compile_options(-fsanitize=undefined)
|
||||
link_libraries (-fsanitize=undefined)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
include("cmake/license.cmake")
|
||||
license_add_file("llama.cpp" "LICENSE")
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
/examples/batched.swift/ @ggerganov
|
||||
/examples/batched/ @ggerganov
|
||||
/examples/convert-llama2c-to-ggml/ @ggerganov
|
||||
/examples/debug/ @danbev @pwilkin
|
||||
/examples/deprecation-warning/ @ggerganov
|
||||
/examples/diffusion/ @am17an
|
||||
/examples/embedding/ @ggerganov
|
||||
|
||||
@@ -32,4 +32,27 @@ function(llama_add_compile_flags)
|
||||
set(CXX_FLAGS "" PARENT_SCOPE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
if (LLAMA_SANITIZE_THREAD)
|
||||
message(STATUS "Using -fsanitize=thread")
|
||||
|
||||
add_compile_options(-fsanitize=thread)
|
||||
link_libraries (-fsanitize=thread)
|
||||
endif()
|
||||
|
||||
if (LLAMA_SANITIZE_ADDRESS)
|
||||
message(STATUS "Using -fsanitize=address")
|
||||
|
||||
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
|
||||
link_libraries (-fsanitize=address)
|
||||
endif()
|
||||
|
||||
if (LLAMA_SANITIZE_UNDEFINED)
|
||||
message(STATUS "Using -fsanitize=undefined")
|
||||
|
||||
add_compile_options(-fsanitize=undefined)
|
||||
link_libraries (-fsanitize=undefined)
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
+18
-16
@@ -45,6 +45,8 @@ static float common_ggml_get_float_value(const uint8_t * data,
|
||||
return v;
|
||||
}
|
||||
|
||||
#define INDENT " "
|
||||
|
||||
template <bool abort>
|
||||
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
||||
GGML_ASSERT(n > 0);
|
||||
@@ -60,41 +62,41 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
|
||||
}
|
||||
}
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
LOG_ERR(" [\n");
|
||||
LOG(INDENT "[\n");
|
||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||
if (i2 == n && ne[2] > 2 * n) {
|
||||
LOG_ERR(" ..., \n");
|
||||
LOG(INDENT INDENT "..., \n");
|
||||
i2 = ne[2] - n;
|
||||
}
|
||||
LOG_ERR(" [\n");
|
||||
LOG(INDENT INDENT "[\n");
|
||||
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||
if (i1 == n && ne[1] > 2 * n) {
|
||||
LOG_ERR(" ..., \n");
|
||||
LOG(INDENT INDENT INDENT "..., \n");
|
||||
i1 = ne[1] - n;
|
||||
}
|
||||
LOG_ERR(" [");
|
||||
LOG(INDENT INDENT INDENT "[");
|
||||
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||
if (i0 == n && ne[0] > 2 * n) {
|
||||
LOG_ERR("..., ");
|
||||
LOG(" ..., ");
|
||||
i0 = ne[0] - n;
|
||||
}
|
||||
const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
||||
LOG_ERR("%12.4f", v);
|
||||
LOG("%12.4f", v);
|
||||
if (i0 < ne[0] - 1) {
|
||||
LOG_ERR(", ");
|
||||
LOG(", ");
|
||||
}
|
||||
}
|
||||
LOG_ERR("],\n");
|
||||
LOG(" ],\n");
|
||||
}
|
||||
LOG_ERR(" ],\n");
|
||||
LOG(INDENT INDENT "],\n");
|
||||
}
|
||||
LOG_ERR(" ]\n");
|
||||
LOG_ERR(" sum = %f\n", sum);
|
||||
LOG(INDENT "]\n");
|
||||
LOG(INDENT "sum = %f\n", sum);
|
||||
}
|
||||
|
||||
if constexpr (abort) {
|
||||
if (std::isnan(sum)) {
|
||||
LOG_ERR("encountered NaN - aborting\n");
|
||||
LOG("encountered NaN - aborting\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
@@ -137,9 +139,9 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b
|
||||
}
|
||||
|
||||
if (matches_filter) {
|
||||
LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
|
||||
ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
|
||||
common_ggml_ne_string(t).c_str());
|
||||
LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
|
||||
ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
|
||||
common_ggml_ne_string(t).c_str());
|
||||
}
|
||||
|
||||
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
|
||||
|
||||
+3
-12
@@ -47,21 +47,15 @@ static std::string common_tokens_to_str(const llama_tokens & inp, size_t start,
|
||||
* @return Vector of draft tokens, empty if no matching pattern is found
|
||||
*/
|
||||
llama_tokens common_ngram_simple_draft(
|
||||
common_ngram_simple_state & state,
|
||||
const common_ngram_simple_config & config,
|
||||
const llama_tokens & tokens, llama_token sampled) {
|
||||
|
||||
// Simple implementation of self-speculative decoding without a draft model.
|
||||
//
|
||||
const size_t cur_len = tokens.size();
|
||||
// Only check every check_rate tokens to save compute
|
||||
// i.e., perform check if (cur_len - idx_last_check) >= check_rate
|
||||
if (state.idx_last_check + state.config.check_rate > cur_len) {
|
||||
llama_tokens draft_tokens;
|
||||
return draft_tokens;
|
||||
}
|
||||
|
||||
size_t n_draft_min = state.config.size_ngram; // size of n-gram to lookup in token history
|
||||
size_t n_draft_max = state.config.size_mgram; // the m-gram following the found n-gram is used for draft
|
||||
const size_t n_draft_min = config.size_ngram; // size of n-gram to lookup in token history
|
||||
const size_t n_draft_max = config.size_mgram; // the m-gram following the found n-gram is used for draft
|
||||
|
||||
// vector for tokens we want to verify.
|
||||
// return empty vector if there is no match.
|
||||
@@ -80,9 +74,6 @@ llama_tokens common_ngram_simple_draft(
|
||||
}
|
||||
pattern.push_back(sampled); // add the last token to the pattern
|
||||
|
||||
// We do a search in the token history.
|
||||
state.idx_last_check = cur_len;
|
||||
|
||||
size_t match_pos = 0; // we ignore position 0, position 0 == no match
|
||||
// search backwards, but skip the current match (we are currently there)
|
||||
for (size_t j = cur_len - n_draft_min - 1; j > 0; --j) {
|
||||
|
||||
+1
-15
@@ -27,23 +27,9 @@ struct common_ngram_simple_config {
|
||||
uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token
|
||||
};
|
||||
|
||||
// current state (and config) of n-gram simple.
|
||||
struct common_ngram_simple_state {
|
||||
common_ngram_simple_config config;
|
||||
|
||||
size_t idx_last_check = 0; // index of last check in context history (mutable)
|
||||
|
||||
common_ngram_simple_state(const common_ngram_simple_config & config)
|
||||
: config(config) {}
|
||||
};
|
||||
|
||||
// Searches for a n-gram in the history and checks whether a draft sequence should be generated.
|
||||
// state: the ngram simple state to search in.
|
||||
// inp: the tokens generated so far.
|
||||
// sampled: the token that was just sampled.
|
||||
// draft: vector to store the draft tokens, initially empty.
|
||||
llama_tokens common_ngram_simple_draft(
|
||||
common_ngram_simple_state & state,
|
||||
const common_ngram_simple_config & config,
|
||||
const llama_tokens & tokens, llama_token sampled);
|
||||
|
||||
|
||||
|
||||
+14
-6
@@ -463,12 +463,14 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
|
||||
|
||||
// state of self-speculation (simple implementation, not ngram-map)
|
||||
struct common_speculative_state_ngram_simple : public common_speculative_state {
|
||||
common_ngram_simple_state state;
|
||||
common_ngram_simple_config config;
|
||||
|
||||
uint16_t check_id = 0; // used to control the frequency of generating drafts
|
||||
|
||||
common_speculative_state_ngram_simple(
|
||||
enum common_speculative_type type,
|
||||
common_ngram_simple_state state)
|
||||
: common_speculative_state(type), state(state) {}
|
||||
common_ngram_simple_config config)
|
||||
: common_speculative_state(type), config(config) {}
|
||||
|
||||
void begin(const llama_tokens & prompt) override {
|
||||
GGML_UNUSED(prompt);
|
||||
@@ -479,7 +481,13 @@ struct common_speculative_state_ngram_simple : public common_speculative_state {
|
||||
const llama_tokens & prompt_tgt,
|
||||
llama_token id_last,
|
||||
llama_tokens & result) override {
|
||||
result = common_ngram_simple_draft(state, prompt_tgt, id_last);
|
||||
++check_id;
|
||||
if (check_id < config.check_rate) {
|
||||
return;
|
||||
}
|
||||
check_id = 0;
|
||||
|
||||
result = common_ngram_simple_draft(config, prompt_tgt, id_last);
|
||||
GGML_UNUSED(params);
|
||||
}
|
||||
|
||||
@@ -889,14 +897,14 @@ common_speculative * common_speculative_init(
|
||||
uint16_t mgram_size_value = ngram_map.size_value;
|
||||
uint16_t check_rate = ngram_map.check_rate;
|
||||
|
||||
auto config_simple = common_ngram_simple_config{
|
||||
auto config_simple = common_ngram_simple_config {
|
||||
/* .size_ngram = */ ngram_size_key,
|
||||
/* .size_mgram = */ mgram_size_value,
|
||||
/* .check_rate = */ check_rate
|
||||
};
|
||||
auto state = std::make_unique<common_speculative_state_ngram_simple>(
|
||||
/* .type = */ config.type,
|
||||
/* .state = */ common_ngram_simple_state(config_simple)
|
||||
/* .state = */ config_simple
|
||||
);
|
||||
impls.push_back(std::move(state));
|
||||
break;
|
||||
|
||||
+1
-3
@@ -252,9 +252,7 @@ CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.ggu
|
||||
|
||||
The environment variable [`CUDA_SCALE_LAUNCH_QUEUES`](https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/environment-variables.html#cuda-scale-launch-queues) controls the size of CUDA's command buffer, which determines how many GPU operations can be queued before the CPU must wait for the GPU to catch up. A larger buffer reduces CPU-side stalls and allows more work to be queued on a GPU.
|
||||
|
||||
**Default behavior:** llama.cpp automatically sets `CUDA_SCALE_LAUNCH_QUEUES=4x`, which increases the CUDA command buffer to 4 times its default size. This optimization is particularly beneficial for **Multi-GPU setups with pipeline parallelism**, where it significantly improves prompt processing throughput by allowing more operations to be enqueued across GPUs.
|
||||
|
||||
See PR [#19042](https://github.com/ggml-org/llama.cpp/pull/19042) for performance benchmarks and technical details.
|
||||
Consider setting `CUDA_SCALE_LAUNCH_QUEUES=4x`, which increases the CUDA command buffer to 4 times its default size. This optimization is particularly beneficial for **Multi-GPU setups with pipeline parallelism**, where it significantly improves prompt processing throughput by allowing more operations to be enqueued across GPUs.
|
||||
|
||||
### Unified Memory
|
||||
|
||||
|
||||
+159
@@ -0,0 +1,159 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from safetensors import safe_open
|
||||
|
||||
|
||||
MODEL_SAFETENSORS_FILE = "model.safetensors"
|
||||
MODEL_SAFETENSORS_INDEX = "model.safetensors.index.json"
|
||||
|
||||
|
||||
def get_weight_map(model_path: Path) -> Optional[dict[str, str]]:
|
||||
index_file = model_path / MODEL_SAFETENSORS_INDEX
|
||||
|
||||
if index_file.exists():
|
||||
with open(index_file, 'r') as f:
|
||||
index = json.load(f)
|
||||
return index.get("weight_map", {})
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_all_tensor_names(model_path: Path) -> list[str]:
|
||||
weight_map = get_weight_map(model_path)
|
||||
|
||||
if weight_map is not None:
|
||||
return list(weight_map.keys())
|
||||
|
||||
single_file = model_path / MODEL_SAFETENSORS_FILE
|
||||
if single_file.exists():
|
||||
try:
|
||||
with safe_open(single_file, framework="pt", device="cpu") as f:
|
||||
return list(f.keys())
|
||||
except Exception as e:
|
||||
print(f"Error reading {single_file}: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Error: No safetensors files found in {model_path}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def find_tensor_file(model_path: Path, tensor_name: str) -> Optional[str]:
|
||||
weight_map = get_weight_map(model_path)
|
||||
|
||||
if weight_map is not None:
|
||||
return weight_map.get(tensor_name)
|
||||
|
||||
single_file = model_path / MODEL_SAFETENSORS_FILE
|
||||
if single_file.exists():
|
||||
return single_file.name
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def normalize_tensor_name(tensor_name: str) -> str:
|
||||
normalized = re.sub(r'\.\d+\.', '.#.', tensor_name)
|
||||
normalized = re.sub(r'\.\d+$', '.#', normalized)
|
||||
return normalized
|
||||
|
||||
|
||||
def list_all_tensors(model_path: Path, unique: bool = False):
|
||||
tensor_names = get_all_tensor_names(model_path)
|
||||
|
||||
if unique:
|
||||
seen = set()
|
||||
for tensor_name in sorted(tensor_names):
|
||||
normalized = normalize_tensor_name(tensor_name)
|
||||
if normalized not in seen:
|
||||
seen.add(normalized)
|
||||
print(normalized)
|
||||
else:
|
||||
for tensor_name in sorted(tensor_names):
|
||||
print(tensor_name)
|
||||
|
||||
|
||||
def print_tensor_info(model_path: Path, tensor_name: str):
|
||||
tensor_file = find_tensor_file(model_path, tensor_name)
|
||||
|
||||
if tensor_file is None:
|
||||
print(f"Error: Could not find tensor '{tensor_name}' in model index")
|
||||
print(f"Model path: {model_path}")
|
||||
sys.exit(1)
|
||||
|
||||
file_path = model_path / tensor_file
|
||||
|
||||
try:
|
||||
with safe_open(file_path, framework="pt", device="cpu") as f:
|
||||
if tensor_name in f.keys():
|
||||
tensor_slice = f.get_slice(tensor_name)
|
||||
shape = tensor_slice.get_shape()
|
||||
print(f"Tensor: {tensor_name}")
|
||||
print(f"File: {tensor_file}")
|
||||
print(f"Shape: {shape}")
|
||||
else:
|
||||
print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}")
|
||||
sys.exit(1)
|
||||
|
||||
except FileNotFoundError:
|
||||
print(f"Error: The file '{file_path}' was not found.")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"An error occurred: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Print tensor information from a safetensors model"
|
||||
)
|
||||
parser.add_argument(
|
||||
"tensor_name",
|
||||
nargs="?", # optional (if --list is used for example)
|
||||
help="Name of the tensor to inspect"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m", "--model-path",
|
||||
type=Path,
|
||||
help="Path to the model directory (default: MODEL_PATH environment variable)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-l", "--list",
|
||||
action="store_true",
|
||||
help="List unique tensor patterns in the model (layer numbers replaced with #)"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
model_path = args.model_path
|
||||
if model_path is None:
|
||||
model_path_str = os.environ.get("MODEL_PATH")
|
||||
if model_path_str is None:
|
||||
print("Error: --model-path not provided and MODEL_PATH environment variable not set")
|
||||
sys.exit(1)
|
||||
model_path = Path(model_path_str)
|
||||
|
||||
if not model_path.exists():
|
||||
print(f"Error: Model path does not exist: {model_path}")
|
||||
sys.exit(1)
|
||||
|
||||
if not model_path.is_dir():
|
||||
print(f"Error: Model path is not a directory: {model_path}")
|
||||
sys.exit(1)
|
||||
|
||||
if args.list:
|
||||
list_all_tensors(model_path, unique=True)
|
||||
else:
|
||||
if args.tensor_name is None:
|
||||
print("Error: tensor_name is required when not using --list")
|
||||
sys.exit(1)
|
||||
print_tensor_info(model_path, args.tensor_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -7,8 +7,6 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_REMOTING_FRONTEND_NAME "RemotingFrontend"
|
||||
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_virtgpu_reg();
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@@ -268,9 +268,9 @@ static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const
|
||||
_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0)));
|
||||
}
|
||||
|
||||
static inline __m256 quad_mx_delta_float(const int8_t x0, const float y0, const int8_t x1, const float y1) {
|
||||
return _mm256_set_m128(_mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)),
|
||||
_mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0)));
|
||||
static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const uint8_t x1, const float y1) {
|
||||
return _mm256_set_m128(_mm_set1_ps(GGML_CPU_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)),
|
||||
_mm_set1_ps(GGML_CPU_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0)));
|
||||
}
|
||||
#endif
|
||||
#elif defined(__SSSE3__)
|
||||
@@ -782,6 +782,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
|
||||
__m256 accum1 = _mm256_setzero_ps();
|
||||
__m256 accum2 = _mm256_setzero_ps();
|
||||
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
|
||||
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
|
||||
@@ -795,10 +796,10 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
||||
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
|
||||
const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
|
||||
accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 0].e)),
|
||||
_mm256_cvtepi32_ps(p_1), accum1);
|
||||
accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 1].e)),
|
||||
_mm256_cvtepi32_ps(p_2), accum2);
|
||||
const __m256 scale0 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_E8M0_TO_FP32_HALF(x[ib + 0].e));
|
||||
const __m256 scale1 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_E8M0_TO_FP32_HALF(x[ib + 1].e));
|
||||
accum1 = _mm256_fmadd_ps(scale0, _mm256_cvtepi32_ps(p_1), accum1);
|
||||
accum2 = _mm256_fmadd_ps(scale1, _mm256_cvtepi32_ps(p_2), accum2);
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
|
||||
@@ -830,7 +831,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
|
||||
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_E8M0_TO_FP32_HALF(x[ib].e);
|
||||
int sumi1 = 0;
|
||||
int sumi2 = 0;
|
||||
for (int j = 0; j < QK_MXFP4/2; ++j) {
|
||||
@@ -3817,4 +3818,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -75,6 +75,9 @@
|
||||
// precomputed f32 table for f16 (256 KB) (simd-mappings.h)
|
||||
float ggml_table_f32_f16[1 << 16];
|
||||
|
||||
// precomputed f32 table for e8m0 half (1 KB) (simd-mappings.h)
|
||||
float ggml_table_f32_e8m0_half[1 << 8];
|
||||
|
||||
#if defined(__ARM_ARCH)
|
||||
struct ggml_arm_arch_features_type {
|
||||
int sve_cnt;
|
||||
@@ -3681,6 +3684,11 @@ void ggml_cpu_init(void) {
|
||||
ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f));
|
||||
}
|
||||
|
||||
// initialize E8M0 half table (256 entries)
|
||||
for (int i = 0; i < (1 << 8); ++i) {
|
||||
ggml_table_f32_e8m0_half[i] = GGML_E8M0_TO_FP32_HALF(i);
|
||||
}
|
||||
|
||||
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
||||
|
||||
GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
|
||||
|
||||
@@ -116,6 +116,17 @@ extern "C" {
|
||||
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
|
||||
extern float ggml_table_f32_f16[1 << 16];
|
||||
|
||||
// precomputed f32 table for e8m0 half (1 KB)
|
||||
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
|
||||
extern float ggml_table_f32_e8m0_half[1 << 8];
|
||||
|
||||
// Use lookup table for E8M0 on x86 (faster than bit manipulation)
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
||||
#define GGML_CPU_E8M0_TO_FP32_HALF(x) ggml_table_f32_e8m0_half[(uint8_t)(x)]
|
||||
#else
|
||||
#define GGML_CPU_E8M0_TO_FP32_HALF(x) GGML_E8M0_TO_FP32_HALF(x)
|
||||
#endif
|
||||
|
||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
||||
// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
|
||||
// This is also true for POWER9.
|
||||
|
||||
@@ -2279,13 +2279,19 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
|
||||
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
if (ne2 == 1) {
|
||||
static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
|
||||
if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
|
||||
if (ggml_is_quantized(src0->type)) {
|
||||
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
|
||||
if (ne2 <= 4) {
|
||||
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst);
|
||||
if (GGML_CUDA_CC_IS_AMD(cc)) {
|
||||
ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) {
|
||||
@@ -5049,16 +5055,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
// Set CUDA_SCALE_LAUNCH_QUEUES before any CUDA API call to improve multi-GPU pipeline parallelism performance
|
||||
// PR: https://github.com/ggml-org/llama.cpp/pull/19042
|
||||
if (getenv("CUDA_SCALE_LAUNCH_QUEUES") == nullptr) {
|
||||
#ifdef _WIN32
|
||||
_putenv_s("CUDA_SCALE_LAUNCH_QUEUES", "4x");
|
||||
#else
|
||||
setenv("CUDA_SCALE_LAUNCH_QUEUES", "4x", 0); // don't overwrite if already set
|
||||
#endif // _WIN32
|
||||
}
|
||||
|
||||
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
|
||||
const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
|
||||
|
||||
|
||||
@@ -3697,13 +3697,20 @@ static __global__ void mul_mat_q(
|
||||
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
|
||||
}
|
||||
|
||||
|
||||
template <ggml_type type, int mmq_x, bool need_check>
|
||||
static __global__ void mul_mat_q_stream_k_fixup(
|
||||
const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile,
|
||||
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst,
|
||||
const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst,
|
||||
const int ncols_max) {
|
||||
static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
const int32_t * expert_bounds,
|
||||
float * __restrict__ dst,
|
||||
const float * __restrict__ tmp_last_tile,
|
||||
const int ncols_x,
|
||||
const int nrows_x,
|
||||
const int ncols_dst,
|
||||
const size_t stride_col_dst,
|
||||
const int nchannels_y,
|
||||
const size_t stride_channel_dst,
|
||||
const int nsamples_y,
|
||||
const size_t stride_sample_dst,
|
||||
const int ncols_max) {
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
|
||||
+127
-67
@@ -4,26 +4,48 @@
|
||||
#include "mmvf.cuh"
|
||||
#include "convert.cuh"
|
||||
|
||||
template <typename T, typename type_acc, int ncols_dst, int block_size, bool has_fusion = false>
|
||||
template <typename T, typename type_acc, int ncols_dst, int block_size, bool has_fusion = false, bool is_multi_token_id = false>
|
||||
static __global__ void mul_mat_vec_f(
|
||||
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
|
||||
const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
|
||||
const int ncols2, const uint3 nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
|
||||
const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
|
||||
const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
const int ids_stride) {
|
||||
const int row = blockIdx.x;
|
||||
// for MUL_MAT_ID - blockIdx.y = n_expert_used, blockIdx.z = ncols_dst (tokens)
|
||||
const int channel_dst = blockIdx.y;
|
||||
const int channel_x = ids ? ids[channel_dst] : fastdiv((uint32_t) channel_dst, channel_ratio);
|
||||
const int channel_y = ids ? channel_dst % nchannels_y : channel_dst;
|
||||
const int sample_dst = blockIdx.z;
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
int token_idx;
|
||||
int channel_x;
|
||||
int channel_y;
|
||||
int sample_dst;
|
||||
|
||||
if constexpr (is_multi_token_id) {
|
||||
// Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case
|
||||
token_idx = blockIdx.z;
|
||||
channel_x = ids[channel_dst + token_idx * ids_stride];
|
||||
channel_y = fastmodulo(channel_dst, nchannels_y);
|
||||
sample_dst = 0;
|
||||
} else {
|
||||
token_idx = ids ? blockIdx.z : 0;
|
||||
channel_x = ids ? ids[blockIdx.y + token_idx * ids_stride] : fastdiv((uint32_t) channel_dst, channel_ratio);
|
||||
channel_y = ids ? fastmodulo(blockIdx.y, nchannels_y) : channel_dst;
|
||||
sample_dst = ids ? 0 : blockIdx.z;
|
||||
}
|
||||
|
||||
const int sample_x = fastdiv((uint32_t) sample_dst, sample_ratio);
|
||||
const int sample_y = sample_dst;
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
|
||||
y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y;
|
||||
dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst;
|
||||
if constexpr (is_multi_token_id) {
|
||||
y += token_idx*stride_col_y2*2;
|
||||
dst += token_idx*stride_col_dst;
|
||||
}
|
||||
|
||||
bool use_gate = false;
|
||||
bool use_bias = false;
|
||||
@@ -56,8 +78,10 @@ static __global__ void mul_mat_vec_f(
|
||||
if (use_gate) {
|
||||
gate_x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
|
||||
}
|
||||
|
||||
const int channel_bias = ids ? channel_x : channel_dst;
|
||||
|
||||
if constexpr (has_fusion) {
|
||||
const int channel_bias = ids ? channel_x : channel_dst;
|
||||
if (use_bias) {
|
||||
x_bias += int64_t(sample_dst)*stride_sample_dst + channel_bias*stride_channel_dst;
|
||||
}
|
||||
@@ -349,36 +373,36 @@ static __global__ void mul_mat_vec_f(
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, typename type_acc, int ncols_dst, int block_size>
|
||||
template<typename T, typename type_acc, int ncols_dst, int block_size, bool is_multi_token_id = false>
|
||||
static void mul_mat_vec_f_switch_fusion(
|
||||
const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
|
||||
const int64_t ncols, const int64_t nrows,
|
||||
const int64_t ncols, const uint3 nchannels_y,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const cudaStream_t stream) {
|
||||
const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const int ids_stride, const cudaStream_t stream) {
|
||||
|
||||
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
|
||||
if constexpr (ncols_dst == 1) {
|
||||
if (has_fusion) {
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, block_size, true><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, block_size, true, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");
|
||||
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, block_size><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_vec_f<T, type_acc, ncols_dst, block_size, false, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
|
||||
|
||||
}
|
||||
|
||||
template <typename T, typename type_acc, int ncols_dst>
|
||||
template <typename T, typename type_acc, int ncols_dst, bool is_multi_token_id = false>
|
||||
void launch_mul_mat_vec_f_cuda(
|
||||
const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
|
||||
const int64_t ncols, const int64_t nrows,
|
||||
@@ -386,12 +410,13 @@ void launch_mul_mat_vec_f_cuda(
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
cudaStream_t stream) {
|
||||
const int64_t nsamples_or_ntokens, const int64_t ids_stride, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % 2 == 0);
|
||||
GGML_ASSERT(stride_row % 2 == 0);
|
||||
GGML_ASSERT(stride_col_y % 2 == 0);
|
||||
GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
|
||||
GGML_ASSERT( nsamples_dst % nsamples_x == 0);
|
||||
const uint3 nchannels_y_fd = ids ? init_fastdiv_values(nchannels_y) : make_uint3(0, 0, 0);
|
||||
const uint3 channel_ratio_fd = ids ? make_uint3(0, 0, 0) : init_fastdiv_values(nchannels_dst / nchannels_x);
|
||||
const uint3 sample_ratio_fd = init_fastdiv_values(nsamples_dst / nsamples_x);
|
||||
|
||||
@@ -415,56 +440,56 @@ void launch_mul_mat_vec_f_cuda(
|
||||
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
|
||||
|
||||
const int nbytes_shared = warp_size*sizeof(float) + (has_fusion ? warp_size*sizeof(float) : 0);
|
||||
const dim3 block_nums(nrows, nchannels_dst, nsamples_dst);
|
||||
const dim3 block_nums(nrows, nchannels_dst, nsamples_or_ntokens);
|
||||
const dim3 block_dims(block_size_best, 1, 1);
|
||||
switch (block_size_best) {
|
||||
case 32: {
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 32>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 32, is_multi_token_id>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
|
||||
} break;
|
||||
case 64: {
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 64>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 64, is_multi_token_id>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
|
||||
} break;
|
||||
case 96: {
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 96>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 96, is_multi_token_id>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
|
||||
} break;
|
||||
case 128: {
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 128>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 128, is_multi_token_id>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
|
||||
} break;
|
||||
case 160: {
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 160>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 160, is_multi_token_id>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
|
||||
} break;
|
||||
case 192: {
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 192>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 192, is_multi_token_id>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
|
||||
} break;
|
||||
case 224: {
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 224>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 224, is_multi_token_id>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
|
||||
} break;
|
||||
case 256: {
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 256>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
|
||||
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 256, is_multi_token_id>
|
||||
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
@@ -480,55 +505,88 @@ static void mul_mat_vec_f_cuda_switch_ncols_dst(
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
cudaStream_t stream) {
|
||||
const int64_t ids_stride, cudaStream_t stream) {
|
||||
|
||||
const bool has_ids = ids != nullptr;
|
||||
|
||||
if (has_ids && ncols_dst > 1) {
|
||||
// Multi-token MUL_MAT_ID path only - single-token goes through regular path below
|
||||
constexpr int c_ncols_dst = 1;
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, c_ncols_dst, true>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
ncols_dst, ids_stride, stream);
|
||||
return;
|
||||
}
|
||||
|
||||
if (has_ids) {
|
||||
// Single-token MUL_MAT_ID path
|
||||
constexpr int c_ncols_dst = 1;
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, c_ncols_dst>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
ncols_dst, ids_stride, stream);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (ncols_dst) {
|
||||
case 1:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 1>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
nsamples_dst, ids_stride, stream);
|
||||
break;
|
||||
case 2:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 2>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
nsamples_dst, ids_stride, stream);
|
||||
break;
|
||||
case 3:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 3>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
nsamples_dst, ids_stride, stream);
|
||||
break;
|
||||
case 4:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 4>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
nsamples_dst, ids_stride, stream);
|
||||
break;
|
||||
case 5:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 5>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
nsamples_dst, ids_stride, stream);
|
||||
break;
|
||||
case 6:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 6>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
nsamples_dst, ids_stride, stream);
|
||||
break;
|
||||
case 7:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 7>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
nsamples_dst, ids_stride, stream);
|
||||
break;
|
||||
case 8:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 8>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
nsamples_dst, ids_stride, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
@@ -544,21 +602,21 @@ static void mul_mat_vec_f_cuda(
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
enum ggml_prec prec, cudaStream_t stream) {
|
||||
const int64_t ids_stride, enum ggml_prec prec, cudaStream_t stream) {
|
||||
|
||||
if constexpr(std::is_same_v<T, half>) {
|
||||
if (prec == GGML_PREC_DEFAULT) {
|
||||
mul_mat_vec_f_cuda_switch_ncols_dst<T, half>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
return;
|
||||
}
|
||||
}
|
||||
mul_mat_vec_f_cuda_switch_ncols_dst<T, float>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst,
|
||||
@@ -573,7 +631,7 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
const size_t ts_src1 = ggml_type_size(src1->type);
|
||||
const size_t ts_dst = ggml_type_size(dst->type);
|
||||
|
||||
GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.
|
||||
GGML_ASSERT(!ids || ne12 <= MMVF_MAX_BATCH_SIZE);
|
||||
GGML_ASSERT(ne13 == ne3);
|
||||
|
||||
GGML_ASSERT( nb00 == ts_src0);
|
||||
@@ -626,29 +684,31 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
const int64_t ncols_dst = ids ? ne2 : ne1;
|
||||
const int64_t nchannels_y = ids ? ne11 : ne12;
|
||||
const int64_t nchannels_dst = ids ? ne1 : ne2;
|
||||
const int64_t stride_col_dst = ids ? s2 : s1;
|
||||
const int64_t stride_col_y = ids ? s12 : s11;
|
||||
const int64_t stride_channel_dst = ids ? s1 : s2;
|
||||
const int64_t stride_channel_y = ids ? s11 : s12;
|
||||
|
||||
GGML_ASSERT(!ids || ncols_dst == 1);
|
||||
const int64_t ids_stride = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0;
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, stride_col_y, stride_col_dst,
|
||||
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03, s13, s3, prec, ctx.stream());
|
||||
ne03, ne3, s03, s13, s3, ids_stride, prec, ctx.stream());
|
||||
} break;
|
||||
case GGML_TYPE_F16: {
|
||||
const half * src0_d = (const half *) src0->data;
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, stride_col_y, stride_col_dst,
|
||||
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03, s13, s3, prec, ctx.stream());
|
||||
ne03, ne3, s03, s13, s3, ids_stride, prec, ctx.stream());
|
||||
} break;
|
||||
case GGML_TYPE_BF16: {
|
||||
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, stride_col_y, stride_col_dst,
|
||||
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03, s13, s3, prec, ctx.stream());
|
||||
ne03, ne3, s03, s13, s3, ids_stride, prec, ctx.stream());
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
|
||||
@@ -695,19 +755,19 @@ void ggml_cuda_op_mul_mat_vec_f(
|
||||
const float * src0_d = (const float *) src0_dd_i;
|
||||
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, 0, prec, stream);
|
||||
} break;
|
||||
case GGML_TYPE_F16: {
|
||||
const half * src0_d = (const half *) src0_dd_i;
|
||||
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, 0, prec, stream);
|
||||
} break;
|
||||
case GGML_TYPE_BF16: {
|
||||
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
|
||||
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, 0, prec, stream);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define MMVF_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVF kernels.
|
||||
|
||||
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst,
|
||||
const ggml_cuda_mm_fusion_args_host * fusion = nullptr);
|
||||
|
||||
|
||||
+85
-50
@@ -137,15 +137,15 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
|
||||
return 1;
|
||||
}
|
||||
|
||||
// tell the compiler to use as many registers as it wants, see nwarps definition below
|
||||
template <ggml_type type, int ncols_dst, bool has_fusion>
|
||||
template <ggml_type type, int ncols_dst, bool has_fusion, bool is_multi_token_id = false>
|
||||
__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
|
||||
static __global__ void mul_mat_vec_q(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
|
||||
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
|
||||
const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
|
||||
const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
|
||||
const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst) {
|
||||
const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst,
|
||||
const uint32_t ids_stride) {
|
||||
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
@@ -162,11 +162,25 @@ static __global__ void mul_mat_vec_q(
|
||||
const int blocks_per_row_x = ncols_x / qk;
|
||||
constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
|
||||
|
||||
// The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1.
|
||||
const uint32_t channel_dst = blockIdx.y;
|
||||
const uint32_t channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio);
|
||||
const uint32_t channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
|
||||
const uint32_t sample_dst = blockIdx.z;
|
||||
|
||||
uint32_t token_idx = 0;
|
||||
uint32_t channel_x;
|
||||
uint32_t channel_y;
|
||||
uint32_t sample_dst;
|
||||
|
||||
if constexpr (is_multi_token_id) {
|
||||
// Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case
|
||||
token_idx = blockIdx.z;
|
||||
channel_x = ids[channel_dst + token_idx * ids_stride];
|
||||
channel_y = fastmodulo(channel_dst, nchannels_y);
|
||||
sample_dst = 0;
|
||||
} else {
|
||||
channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio);
|
||||
channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
|
||||
sample_dst = blockIdx.z;
|
||||
}
|
||||
|
||||
const uint32_t sample_x = fastdiv(sample_dst, sample_ratio);
|
||||
const uint32_t sample_y = sample_dst;
|
||||
|
||||
@@ -188,11 +202,11 @@ static __global__ void mul_mat_vec_q(
|
||||
active_glu = fusion.glu_op;
|
||||
}
|
||||
|
||||
const uint32_t channel_bias = ids ? channel_x : channel_dst;
|
||||
|
||||
float x_biases[ncols_dst] = { 0.0f };
|
||||
float gate_biases[ncols_dst] = { 0.0f };
|
||||
if constexpr (has_fusion) {
|
||||
const uint32_t channel_bias = ids ? channel_x : channel_dst;
|
||||
if (use_bias) {
|
||||
x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
|
||||
// 1. Hide latency by prefetching bias and gate here
|
||||
@@ -222,6 +236,9 @@ static __global__ void mul_mat_vec_q(
|
||||
float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}};
|
||||
|
||||
const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y;
|
||||
if constexpr (is_multi_token_id) {
|
||||
y += token_idx*stride_col_y;
|
||||
}
|
||||
const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x;
|
||||
|
||||
for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
|
||||
@@ -275,6 +292,10 @@ static __global__ void mul_mat_vec_q(
|
||||
|
||||
dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0;
|
||||
|
||||
if constexpr (is_multi_token_id) {
|
||||
dst += token_idx*stride_col_dst;
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; ++j) {
|
||||
@@ -335,40 +356,41 @@ static __global__ void mul_mat_vec_q(
|
||||
}
|
||||
|
||||
static std::pair<dim3, dim3> calc_launch_params(
|
||||
const int ncols_dst, const int nrows_x, const int nchannels_y, const int nsamples_y,
|
||||
const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens,
|
||||
const int warp_size, const mmvq_parameter_table_id table_id) {
|
||||
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id);
|
||||
const dim3 block_nums(nblocks, nchannels_y, nsamples_y);
|
||||
const dim3 block_nums(nblocks, nchannels_dst, nsamples_or_ntokens);
|
||||
const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1);
|
||||
return {block_nums, block_dims};
|
||||
}
|
||||
|
||||
template<ggml_type type, int c_ncols_dst>
|
||||
template<ggml_type type, int c_ncols_dst, bool is_multi_token_id = false>
|
||||
static void mul_mat_vec_q_switch_fusion(
|
||||
const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
|
||||
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
|
||||
const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
|
||||
const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
|
||||
const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst,
|
||||
const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared, cudaStream_t stream) {
|
||||
const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared,
|
||||
const uint32_t ids_stride, cudaStream_t stream) {
|
||||
|
||||
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
|
||||
if constexpr (c_ncols_dst == 1) {
|
||||
if (has_fusion) {
|
||||
mul_mat_vec_q<type, c_ncols_dst, true><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
mul_mat_vec_q<type, c_ncols_dst, true, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");
|
||||
|
||||
mul_mat_vec_q<type, c_ncols_dst, false><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
mul_mat_vec_q<type, c_ncols_dst, false, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
|
||||
}
|
||||
|
||||
template <ggml_type type>
|
||||
@@ -379,7 +401,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
const int nchannels_x, const int nchannels_y, const int nchannels_dst,
|
||||
const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
cudaStream_t stream) {
|
||||
const int ids_stride, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
|
||||
GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE);
|
||||
@@ -393,8 +415,19 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc);
|
||||
|
||||
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
|
||||
const bool has_ids = ids != nullptr;
|
||||
|
||||
if (has_ids && ncols_dst > 1) {
|
||||
// Multi-token MUL_MAT_ID path only - single-token goes through regular path below
|
||||
constexpr int c_ncols_dst = 1;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst, true>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(!ids || ncols_dst == 1);
|
||||
switch (ncols_dst) {
|
||||
case 1: {
|
||||
constexpr int c_ncols_dst = 1;
|
||||
@@ -402,7 +435,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, stream);
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
} break;
|
||||
case 2: {
|
||||
constexpr int c_ncols_dst = 2;
|
||||
@@ -410,7 +443,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, stream);
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
} break;
|
||||
case 3: {
|
||||
constexpr int c_ncols_dst = 3;
|
||||
@@ -418,7 +451,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, stream);
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
} break;
|
||||
case 4: {
|
||||
constexpr int c_ncols_dst = 4;
|
||||
@@ -426,7 +459,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, stream);
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
} break;
|
||||
case 5: {
|
||||
constexpr int c_ncols_dst = 5;
|
||||
@@ -434,7 +467,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, stream);
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
} break;
|
||||
case 6: {
|
||||
constexpr int c_ncols_dst = 6;
|
||||
@@ -442,7 +475,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, stream);
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
} break;
|
||||
case 7: {
|
||||
constexpr int c_ncols_dst = 7;
|
||||
@@ -450,7 +483,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, stream);
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
} break;
|
||||
case 8: {
|
||||
constexpr int c_ncols_dst = 8;
|
||||
@@ -458,7 +491,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, stream);
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
@@ -474,127 +507,127 @@ static void mul_mat_vec_q_switch_type(
|
||||
const int nchannels_x, const int nchannels_y, const int nchannels_dst,
|
||||
const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
cudaStream_t stream) {
|
||||
const int ids_stride, cudaStream_t stream) {
|
||||
switch (type_x) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_0>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_1>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_0>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_1>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q8_0>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_MXFP4:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_MXFP4>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q2_K>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q3_K>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_K>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_K>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q6_K>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XXS>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XS>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_S>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_XXS>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_S>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_M>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_NL>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_XS>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_S>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
@@ -622,7 +655,7 @@ void ggml_cuda_mul_mat_vec_q(
|
||||
GGML_ASSERT( nb0 == ts_dst);
|
||||
GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
|
||||
|
||||
GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.
|
||||
GGML_ASSERT(!ids || ne12 <= MMVQ_MAX_BATCH_SIZE);
|
||||
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
|
||||
@@ -693,11 +726,13 @@ void ggml_cuda_mul_mat_vec_q(
|
||||
const int64_t stride_channel_dst = ids ? s1 : s2;
|
||||
const int64_t stride_channel_y = ids ? s11 : s12;
|
||||
|
||||
const int64_t ids_stride = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0;
|
||||
|
||||
mul_mat_vec_q_switch_type(
|
||||
src0->data, src0->type, src1_q8_1.get(), ids_d, fusion_local, dst_d, ne00,
|
||||
ne01, ncols_dst, s01, stride_col_y, stride_col_dst,
|
||||
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03, s13, s3, stream);
|
||||
ne03, ne3, s03, s13, s3, ids_stride, stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_mul_mat_vec_q(
|
||||
@@ -726,7 +761,7 @@ void ggml_cuda_op_mul_mat_vec_q(
|
||||
ggml_cuda_mm_fusion_args_device fusion_local{};
|
||||
mul_mat_vec_q_switch_type(
|
||||
src0_dd_i, src0->type, src1_ddq_i, nullptr, fusion_local, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream);
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, stream);
|
||||
|
||||
GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_ncols, src1_padded_row_size);
|
||||
}
|
||||
|
||||
@@ -534,6 +534,36 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv(ggml_metal_
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||
char base[256];
|
||||
char name[256];
|
||||
|
||||
const int nsg = 8;
|
||||
const int n = op->src[1]->ne[1];
|
||||
const int k = op->src[1]->ne[0];
|
||||
|
||||
snprintf(base, 256, "kernel_solve_tri_%s", ggml_type_name(op->src[0]->type));
|
||||
snprintf(name, 256, "%s_nsg=%d_n=%d_k=%d", base, nsg, n, k);
|
||||
|
||||
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
||||
if (!res.pipeline) {
|
||||
ggml_metal_cv_t cv = ggml_metal_cv_init();
|
||||
|
||||
ggml_metal_cv_set_int16(cv, nsg, FC_SOLVE_TRI + 0);
|
||||
ggml_metal_cv_set_int16(cv, n, FC_SOLVE_TRI + 1);
|
||||
ggml_metal_cv_set_int16(cv, k, FC_SOLVE_TRI + 2);
|
||||
|
||||
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
|
||||
|
||||
ggml_metal_cv_free(cv);
|
||||
}
|
||||
|
||||
res.nsg = nsg;
|
||||
res.smem = GGML_PAD(GGML_PAD(n, 32)*nsg*sizeof(float), 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext(ggml_metal_library_t lib, ggml_type tsrc0, ggml_type tsrc1, int nsg, int nxpsg, int r1ptg) {
|
||||
char base[256];
|
||||
char name[256];
|
||||
|
||||
@@ -121,6 +121,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched (ggml_metal_library_t lib, const struct ggml_tensor * op, int ssm_conv_bs);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
|
||||
@@ -1152,6 +1152,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
return has_simdgroup_reduction;
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_RWKV_WKV7:
|
||||
case GGML_OP_SOLVE_TRI:
|
||||
return true;
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
|
||||
@@ -78,13 +78,14 @@
|
||||
#define FC_MUL_MM 700
|
||||
#define FC_ROPE 800
|
||||
#define FC_SSM_CONV 900
|
||||
#define FC_COUNT_EQUAL 1000
|
||||
#define FC_SOLVE_TRI 1000
|
||||
#define FC_COUNT_EQUAL 1100
|
||||
|
||||
// op-specific constants
|
||||
#define OP_FLASH_ATTN_EXT_NQPTG 8
|
||||
#define OP_FLASH_ATTN_EXT_NQPSG 8
|
||||
#define OP_FLASH_ATTN_EXT_NCPSG 64
|
||||
|
||||
#define OP_FLASH_ATTN_EXT_VEC_NQPTG 1
|
||||
#define OP_FLASH_ATTN_EXT_VEC_NQPSG 1
|
||||
#define OP_FLASH_ATTN_EXT_VEC_NCPSG 32
|
||||
|
||||
// kernel argument structs
|
||||
@@ -733,6 +734,33 @@ typedef struct {
|
||||
uint64_t nb0;
|
||||
} ggml_metal_kargs_ssm_scan;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne00;
|
||||
int32_t ne01;
|
||||
int32_t ne02;
|
||||
int32_t ne03;
|
||||
uint64_t nb00;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
int32_t ne10;
|
||||
int32_t ne11;
|
||||
int32_t ne12;
|
||||
int32_t ne13;
|
||||
uint64_t nb10;
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
uint64_t nb13;
|
||||
int32_t ne0;
|
||||
int32_t ne1;
|
||||
int32_t ne2;
|
||||
int32_t ne3;
|
||||
uint64_t nb0;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
uint64_t nb3;
|
||||
} ggml_metal_kargs_solve_tri;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne00t;
|
||||
int32_t ne00;
|
||||
|
||||
@@ -341,6 +341,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
||||
{
|
||||
n_fuse = ggml_metal_op_rwkv(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_SOLVE_TRI:
|
||||
{
|
||||
n_fuse = ggml_metal_op_solve_tri(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
n_fuse = ggml_metal_op_mul_mat(ctx, idx);
|
||||
@@ -1557,6 +1561,63 @@ int ggml_metal_op_rwkv(ggml_metal_op_t ctx, int idx) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int ggml_metal_op_solve_tri(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_tensor * op = ctx->node(idx);
|
||||
|
||||
ggml_metal_library_t lib = ctx->lib;
|
||||
ggml_metal_encoder_t enc = ctx->enc;
|
||||
|
||||
GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
|
||||
GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
|
||||
GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
|
||||
GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
|
||||
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
|
||||
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
|
||||
|
||||
ggml_metal_kargs_solve_tri args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.ne03 =*/ ne03,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne10 =*/ ne10,
|
||||
/*.ne11 =*/ ne11,
|
||||
/*.ne12 =*/ ne12,
|
||||
/*.ne13 =*/ ne13,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.ne3 =*/ ne3,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
};
|
||||
|
||||
auto pipeline = ggml_metal_library_get_pipeline_solve_tri(lib, op);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
|
||||
|
||||
const int nsg = pipeline.nsg;
|
||||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, pipeline.smem, 0);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne10 + nsg - 1)/nsg, ne02, ne03, 32, nsg, 1);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_tensor * op = ctx->node(idx);
|
||||
|
||||
@@ -2295,7 +2356,7 @@ size_t ggml_metal_op_flash_attn_ext_extra_blk(const ggml_tensor * op) {
|
||||
// return res;
|
||||
//}
|
||||
|
||||
const int nqptg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NQPTG : OP_FLASH_ATTN_EXT_NQPTG;
|
||||
const int nqptg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NQPSG : OP_FLASH_ATTN_EXT_NQPSG;
|
||||
const int ncpsg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NCPSG : OP_FLASH_ATTN_EXT_NCPSG;
|
||||
|
||||
const int64_t ne1 = (ne01 + nqptg - 1)/nqptg;
|
||||
@@ -2411,7 +2472,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
||||
|
||||
if (!ggml_metal_op_flash_attn_ext_use_vec(op)) {
|
||||
// half8x8 kernel
|
||||
const int nqptg = OP_FLASH_ATTN_EXT_NQPTG; // queries per threadgroup
|
||||
const int nqptg = OP_FLASH_ATTN_EXT_NQPSG; // queries per threadgroup
|
||||
const int ncpsg = OP_FLASH_ATTN_EXT_NCPSG; // cache values per simdgroup
|
||||
|
||||
GGML_ASSERT(nqptg <= 32);
|
||||
@@ -2578,9 +2639,9 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
||||
#undef FATTN_SMEM
|
||||
} else {
|
||||
// half4x4 kernel
|
||||
const int nqptg = OP_FLASH_ATTN_EXT_VEC_NQPTG; // queries per threadgroup
|
||||
const int nqptg = OP_FLASH_ATTN_EXT_VEC_NQPSG; // queries per threadgroup
|
||||
const int ncpsg = OP_FLASH_ATTN_EXT_VEC_NCPSG; // cache values per simdgroup !! sync with kernel template arguments !!
|
||||
const int nkpsg = 1*ncpsg;
|
||||
const int nhptg = 1; // heads per threadgroup
|
||||
|
||||
GGML_ASSERT(nqptg <= 32);
|
||||
GGML_ASSERT(nqptg % 1 == 0);
|
||||
@@ -2632,6 +2693,9 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
}
|
||||
|
||||
// note: for simplicity assume the K is larger or equal than V
|
||||
GGML_ASSERT(ne10 >= ne20);
|
||||
|
||||
// ne00 + 2*ncpsg*(nsg)
|
||||
// for each query, we load it as f16 in shared memory (ne00)
|
||||
// and store the soft_max values and the mask
|
||||
@@ -2639,28 +2703,9 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
||||
// ne20*(nsg)
|
||||
// each simdgroup has a full f32 head vector in shared mem to accumulate results
|
||||
//
|
||||
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + 2*GGML_PAD(ne20, 128)*(nsg))*(sizeof(float)/2), 16))
|
||||
|
||||
int64_t nsgmax = 2;
|
||||
while (true) {
|
||||
const size_t smem = FATTN_SMEM(nsgmax);
|
||||
// avoid using more than half of the threadgroup memory - can cause slow downs especially for large head sizes
|
||||
if (smem > props_dev->max_theadgroup_memory_size/2) {
|
||||
break;
|
||||
}
|
||||
nsgmax *= 2;
|
||||
}
|
||||
nsgmax /= 2;
|
||||
|
||||
// simdgroups per threadgroup (a.k.a. warps)
|
||||
//const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)));
|
||||
const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) 1024/32)));
|
||||
#define FATTN_SMEM(nsg) (GGML_PAD(((GGML_PAD(ne00, 128) + 4*ncpsg + 2*GGML_PAD(ne20, 128))*(nsg))*(sizeof(float)/2), 16))
|
||||
|
||||
int64_t nsg = 1;
|
||||
while (nsg <= nsgt) {
|
||||
nsg *= 2;
|
||||
}
|
||||
nsg /= 2;
|
||||
|
||||
// workgroups
|
||||
// each workgroup handles nsg*nkpsg cache values
|
||||
@@ -2673,7 +2718,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
||||
} else {
|
||||
nwg = 32;
|
||||
nsg = 1;
|
||||
while (2*nwg*nsg*nkpsg < ne11 && nsg < 4) {
|
||||
while (2*nwg*nsg*ncpsg < ne11 && nsg < 4) {
|
||||
nsg *= 2;
|
||||
}
|
||||
}
|
||||
@@ -2739,7 +2784,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
||||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, (ne02 + nhptg - 1)/nhptg, ne03*nwg, 32, nsg, 1);
|
||||
} else {
|
||||
// sanity checks
|
||||
assert(ggml_metal_op_flash_attn_ext_extra_tmp(op) != 0);
|
||||
@@ -2752,7 +2797,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_metal_encoder_set_buffer(enc, bid_tmp, 7);
|
||||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, (ne02 + nhptg - 1)/nhptg, ne03*nwg, 32, nsg, 1);
|
||||
|
||||
// sync the 2 kernels
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
|
||||
@@ -60,6 +60,7 @@ int ggml_metal_op_soft_max (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_rwkv (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_solve_tri (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_cpy (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_pool_1d (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_pool_2d (ggml_metal_op_t ctx, int idx);
|
||||
|
||||
@@ -2737,6 +2737,83 @@ kernel void kernel_rwkv_wkv7_f32(
|
||||
}
|
||||
}
|
||||
|
||||
constant short FC_solve_tri_nsg [[function_constant(FC_SOLVE_TRI + 0)]];
|
||||
constant short FC_solve_tri_n [[function_constant(FC_SOLVE_TRI + 1)]];
|
||||
constant short FC_solve_tri_k [[function_constant(FC_SOLVE_TRI + 2)]];
|
||||
|
||||
kernel void kernel_solve_tri_f32(
|
||||
constant ggml_metal_kargs_solve_tri & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
ushort3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
constexpr short NW = N_SIMDWIDTH;
|
||||
|
||||
const short NSG = FC_solve_tri_nsg;
|
||||
const short N = FC_solve_tri_n;
|
||||
const short K = FC_solve_tri_k;
|
||||
const short NP = PAD2(N, NW);
|
||||
|
||||
const int32_t ne02 = args.ne02;
|
||||
const int32_t ne03 = args.ne03;
|
||||
|
||||
const int32_t i03 = tgpig.z;
|
||||
const int32_t i02 = tgpig.y;
|
||||
const int32_t i01 = tgpig.x*NSG + sgitg;
|
||||
|
||||
threadgroup float * sh0 = (threadgroup float *) shmem;
|
||||
|
||||
device const float * src0_ptr = (device const float *)(src0 + i02 * args.nb02 + i03 * args.nb03) + sgitg*N;
|
||||
device const float * src1_ptr = (device const float *)(src1 + i02 * args.nb12 + i03 * args.nb13) + i01;
|
||||
device float * dst_ptr = (device float *)(dst + i02 * args.nb2 + i03 * args.nb3) + i01;
|
||||
|
||||
for (short rr = 0; rr < N; rr += NSG) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
{
|
||||
threadgroup float * sh0_cur = sh0 + sgitg*NP;
|
||||
|
||||
for (short t = 0; t*NW < N; ++t) {
|
||||
const short idx = t*NW + tiisg;
|
||||
sh0_cur[idx] = src0_ptr[idx];
|
||||
}
|
||||
|
||||
src0_ptr += NSG*N;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (i01 >= args.ne10) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (short ir = 0; ir < NSG && rr + ir < N; ++ir) {
|
||||
const short r = rr + ir;
|
||||
|
||||
threadgroup float * sh0_cur = sh0 + ir*NP;
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
for (short t = 0; t*NW < r; ++t) {
|
||||
const short idx = t*NW + tiisg;
|
||||
sum += sh0_cur[idx] * dst_ptr[idx*K] * (idx < r);
|
||||
}
|
||||
|
||||
sum = simd_sum(sum);
|
||||
|
||||
if (tiisg == 0) {
|
||||
const float diag = sh0_cur[r];
|
||||
|
||||
dst_ptr[r*K] = (src1_ptr[r*K] - sum) / diag;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_argmax_f32(
|
||||
constant ggml_metal_kargs_argmax & args,
|
||||
device const char * src0,
|
||||
@@ -5931,7 +6008,7 @@ template<
|
||||
void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
|
||||
short DK, // K head size
|
||||
short DV, // V head size
|
||||
short Q = OP_FLASH_ATTN_EXT_NQPTG, // queries per threadgroup
|
||||
short Q = OP_FLASH_ATTN_EXT_NQPSG, // queries per threadgroup
|
||||
short C = OP_FLASH_ATTN_EXT_NCPSG> // cache items per threadgroup
|
||||
kernel void kernel_flash_attn_ext(
|
||||
constant ggml_metal_kargs_flash_attn_ext & args,
|
||||
@@ -6141,11 +6218,10 @@ template<
|
||||
void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
|
||||
short DK, // K head size
|
||||
short DV, // V head size
|
||||
short NE, // head elements per thread
|
||||
short Q, // queries per threadgroup
|
||||
short C, // cache items per threadgroup
|
||||
short NSG> // number of simd groups
|
||||
void kernel_flash_attn_ext_vec_impl(
|
||||
short NE = 4, // head elements per thread
|
||||
short Q = OP_FLASH_ATTN_EXT_VEC_NQPSG, // queries per threadgroup
|
||||
short C = OP_FLASH_ATTN_EXT_VEC_NCPSG> // cache items per threadgroup
|
||||
kernel void kernel_flash_attn_ext_vec(
|
||||
constant ggml_metal_kargs_flash_attn_ext_vec & args,
|
||||
device const char * q,
|
||||
device const char * k,
|
||||
@@ -6162,6 +6238,7 @@ void kernel_flash_attn_ext_vec_impl(
|
||||
static_assert(DV % 32 == 0, "DV must be divisible by 32");
|
||||
|
||||
#define NWG (FC_flash_attn_ext_vec_nwg)
|
||||
#define NSG (FC_flash_attn_ext_vec_nsg)
|
||||
|
||||
#define NS10 (FC_flash_attn_ext_vec_ns10)
|
||||
#define NS20 (FC_flash_attn_ext_vec_ns20)
|
||||
@@ -6190,12 +6267,12 @@ void kernel_flash_attn_ext_vec_impl(
|
||||
|
||||
const short T = PK + NSG*SH; // shared memory size per query in (half)
|
||||
|
||||
//threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*PK); // holds the query data
|
||||
threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*PK); // same as above but in q4_t
|
||||
threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + Q*PK); // scratch buffer for attention
|
||||
threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + Q*PK); // same as above but in s4_t
|
||||
threadgroup half * sm = (threadgroup half *) (shmem_f16 + sgitg*SH + 2*C + Q*PK); // scratch buffer for mask
|
||||
threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 2*sgitg*PV + Q*T); // scratch buffer for the results
|
||||
//threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*PK); // holds the query data
|
||||
threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*PK); // same as above but in q4_t
|
||||
threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + NSG*PK); // scratch buffer for attention
|
||||
threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + NSG*PK); // same as above but in s4_t
|
||||
threadgroup half * sm = (threadgroup half *) (shmem_f16 + sgitg*SH + 2*C + NSG*PK); // scratch buffer for mask
|
||||
threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 2*sgitg*PV + NSG*PK + NSG*SH); // scratch buffer for the results
|
||||
|
||||
// store the result for all queries in shared memory (the O matrix from the paper)
|
||||
so4 += tiisg;
|
||||
@@ -6213,11 +6290,13 @@ void kernel_flash_attn_ext_vec_impl(
|
||||
// load heads from Q to shared memory
|
||||
device const float4 * q4 = (device const float4 *) ((device const char *) q);
|
||||
|
||||
for (short i = tiisg; i < PK4; i += NW) {
|
||||
if (iq1 < args.ne01 && i < DK4) {
|
||||
sq4[i] = (q4_t) q4[i];
|
||||
} else {
|
||||
sq4[i] = (q4_t) 0.0f;
|
||||
if (iq1 < args.ne01) {
|
||||
for (short i = tiisg; i < PK4; i += NW) {
|
||||
if (i < DK4) {
|
||||
sq4[i] = (q4_t) q4[i];
|
||||
} else {
|
||||
sq4[i] = (q4_t) 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6295,7 +6374,7 @@ void kernel_flash_attn_ext_vec_impl(
|
||||
}
|
||||
|
||||
// skip -INF blocks
|
||||
if (simd_max(sm[tiisg]) == -INFINITY) {
|
||||
if (simd_max(sm[tiisg]) <= -MAXHALF) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -6569,57 +6648,11 @@ void kernel_flash_attn_ext_vec_impl(
|
||||
}
|
||||
|
||||
#undef NWG
|
||||
#undef NSG
|
||||
#undef NS10
|
||||
#undef NS20
|
||||
}
|
||||
|
||||
template<
|
||||
typename q4_t, // query types in shared memory
|
||||
typename k4_t, // key types in shared memory
|
||||
typename v4_t, // value types in shared memory
|
||||
typename qk_t, // Q*K types
|
||||
typename s_t, // soft-max types
|
||||
typename s4_t,
|
||||
typename o4_t, // attention accumulation types
|
||||
typename kd4_t, // key type in device memory
|
||||
short nl_k,
|
||||
void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &),
|
||||
typename vd4_t, // value type in device memory
|
||||
short nl_v,
|
||||
void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
|
||||
short DK, // K head size
|
||||
short DV, // V head size
|
||||
short NE = 4, // head elements per thread
|
||||
short Q = OP_FLASH_ATTN_EXT_VEC_NQPTG, // queries per threadgroup
|
||||
short C = OP_FLASH_ATTN_EXT_VEC_NCPSG> // cache items per threadgroup
|
||||
kernel void kernel_flash_attn_ext_vec(
|
||||
constant ggml_metal_kargs_flash_attn_ext_vec & args,
|
||||
device const char * q,
|
||||
device const char * k,
|
||||
device const char * v,
|
||||
device const char * mask,
|
||||
device const char * sinks,
|
||||
device const char * pad,
|
||||
device char * dst,
|
||||
threadgroup half * shmem_f16 [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
#define FWD_TMPL q4_t, k4_t, v4_t, qk_t, s_t, s4_t, o4_t, kd4_t, nl_k, deq_k_t4, vd4_t, nl_v, deq_v_t4, DK, DV, NE, Q, C
|
||||
#define FWD_ARGS args, q, k, v, mask, sinks, pad, dst, shmem_f16, tgpig, tiisg, sgitg
|
||||
switch (FC_flash_attn_ext_vec_nsg) {
|
||||
// note: disabled cases to reduce library load time
|
||||
case 1: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 1>(FWD_ARGS); break;
|
||||
case 2: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 2>(FWD_ARGS); break;
|
||||
case 4: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 4>(FWD_ARGS); break;
|
||||
//case 8: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 8>(FWD_ARGS); break;
|
||||
//case 16: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 16>(FWD_ARGS); break;
|
||||
//case 32: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 32>(FWD_ARGS); break;
|
||||
}
|
||||
#undef FWD_TMPL
|
||||
#undef FWD_ARGS
|
||||
}
|
||||
|
||||
// note: I think the s_t can be half instead of float, because the Q*K scaling is done before storing to shared mem
|
||||
// in the other (non-vec) kernel, we need s_t to also be float because we scale during the soft_max
|
||||
//
|
||||
|
||||
@@ -36,7 +36,7 @@ apir_rpc_tensor apir_serialize_tensor(const ggml_tensor * tensor) {
|
||||
result.data = reinterpret_cast<uint64_t>(tensor->data);
|
||||
if (tensor->data) {
|
||||
if (!tensor->buffer) {
|
||||
GGML_ABORT("tensor has data but not buffer");
|
||||
GGML_ABORT("%s: tensor has data but not buffer", __func__);
|
||||
}
|
||||
// tensor->data is serialized as an offset to the buffer base address
|
||||
result.data -= reinterpret_cast<uint64_t>(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base);
|
||||
|
||||
@@ -27,7 +27,7 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v
|
||||
|
||||
const void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
|
||||
if (!shmem_data) {
|
||||
GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__);
|
||||
apir_decoder_set_fatal(dec);
|
||||
return 1;
|
||||
}
|
||||
@@ -45,7 +45,7 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v
|
||||
if (dev->iface.supports_op(dev, op)) {
|
||||
continue;
|
||||
}
|
||||
GGML_LOG_ERROR("Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op));
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op));
|
||||
|
||||
status = GGML_STATUS_ABORTED;
|
||||
apir_encode_ggml_status(enc, &status);
|
||||
|
||||
@@ -36,18 +36,22 @@ uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec
|
||||
ggml_backend_buffer_type_t buft;
|
||||
buft = apir_decode_ggml_buffer_type(dec);
|
||||
|
||||
size_t value = buft->iface.get_max_size(buft);
|
||||
size_t value = SIZE_MAX;
|
||||
if (buft->iface.get_max_size) {
|
||||
value = buft->iface.get_max_size(buft);
|
||||
}
|
||||
|
||||
apir_encode_size_t(enc, &value);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST is deprecated. Keeping the handler for backward compatibility. */
|
||||
uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
|
||||
GGML_UNUSED(ctx);
|
||||
ggml_backend_buffer_type_t buft;
|
||||
buft = apir_decode_ggml_buffer_type(dec);
|
||||
GGML_UNUSED(dec);
|
||||
const bool is_host = false;
|
||||
|
||||
bool is_host = buft->iface.is_host(buft);
|
||||
apir_encode_bool_t(enc, &is_host);
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -40,7 +40,7 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl
|
||||
void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
|
||||
|
||||
if (!shmem_data) {
|
||||
GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -71,7 +71,7 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl
|
||||
|
||||
void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
|
||||
if (!shmem_data) {
|
||||
GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -121,7 +121,7 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg
|
||||
buffer = apir_decode_ggml_buffer(dec);
|
||||
|
||||
if (!apir_untrack_backend_buffer(buffer)) {
|
||||
GGML_LOG_WARN("%s: unknown buffer %p\n", __func__, (void *) buffer);
|
||||
GGML_LOG_WARN(GGML_VIRTGPU_BCK "%s: unknown buffer %p\n", __func__, (void *) buffer);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -124,7 +124,7 @@ uint32_t backend_device_buffer_from_ptr(apir_encoder * enc, apir_decoder * dec,
|
||||
|
||||
void * shmem_ptr = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
|
||||
if (!shmem_ptr) {
|
||||
GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__);
|
||||
apir_decoder_set_fatal(dec);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -17,26 +17,26 @@ uint64_t timer_count = 0;
|
||||
|
||||
uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p) {
|
||||
if (reg != NULL) {
|
||||
GGML_LOG_WARN("%s: already initialized\n", __func__);
|
||||
GGML_LOG_WARN(GGML_VIRTGPU_BCK "%s: already initialized\n", __func__);
|
||||
return APIR_BACKEND_INITIALIZE_ALREADY_INITED;
|
||||
}
|
||||
ggml_backend_reg_t (*ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p;
|
||||
|
||||
reg = ggml_backend_reg_fct();
|
||||
if (reg == NULL) {
|
||||
GGML_LOG_ERROR("%s: backend registration failed\n", __func__);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend registration failed\n", __func__);
|
||||
return APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED;
|
||||
}
|
||||
|
||||
if (!reg->iface.get_device_count(reg)) {
|
||||
GGML_LOG_ERROR("%s: backend initialization failed: no device found\n", __func__);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device found\n", __func__);
|
||||
return APIR_BACKEND_INITIALIZE_NO_DEVICE;
|
||||
}
|
||||
|
||||
dev = reg->iface.get_device(reg, 0);
|
||||
|
||||
if (!dev) {
|
||||
GGML_LOG_ERROR("%s: backend initialization failed: no device received\n", __func__);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device received\n", __func__);
|
||||
return APIR_BACKEND_INITIALIZE_NO_DEVICE;
|
||||
}
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ uint32_t backend_device_buffer_from_ptr(apir_encoder * enc, apir_decoder * dec,
|
||||
uint32_t backend_buffer_type_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
|
||||
uint32_t backend_buffer_type_get_alignment(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
|
||||
uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
|
||||
/* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST is deprecated. Keeping the handler for backward compatibility. */
|
||||
uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
|
||||
uint32_t backend_buffer_type_alloc_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
|
||||
uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
|
||||
@@ -62,7 +63,7 @@ static inline const char * backend_dispatch_command_name(ApirBackendCommandType
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE:
|
||||
return "backend_buffer_type_get_max_size";
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST:
|
||||
return "backend_buffer_type_is_host";
|
||||
return "backend_buffer_type_is_host (DEPRECATED)";
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER:
|
||||
return "backend_buffer_type_alloc_buffer";
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE:
|
||||
@@ -110,7 +111,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC
|
||||
/* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = */ backend_buffer_type_get_name,
|
||||
/* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = */ backend_buffer_type_get_alignment,
|
||||
/* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = */ backend_buffer_type_get_max_size,
|
||||
/* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = */ backend_buffer_type_is_host,
|
||||
/* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = */ backend_buffer_type_is_host /* DEPRECATED */,
|
||||
/* APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = */ backend_buffer_type_alloc_buffer,
|
||||
/* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE = */ backend_buffer_type_get_alloc_size,
|
||||
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
#include "shared/apir_cs.h"
|
||||
#include "shared/apir_cs_ggml.h"
|
||||
|
||||
#define GGML_VIRTGPU_BCK "ggml-virtgpu-backend: "
|
||||
|
||||
struct virgl_apir_context {
|
||||
uint32_t ctx_id;
|
||||
virgl_apir_callbacks * iface;
|
||||
|
||||
@@ -35,14 +35,8 @@ void apir_backend_deinit(uint32_t virgl_ctx_id) {
|
||||
buffer->iface.free_buffer(buffer);
|
||||
}
|
||||
|
||||
if (dev) {
|
||||
size_t free, total;
|
||||
dev->iface.get_memory(dev, &free, &total);
|
||||
GGML_LOG_INFO("%s: free memory: %ld MB\n", __func__, (size_t) free / 1024 / 1024);
|
||||
}
|
||||
|
||||
if (backend_library_handle) {
|
||||
GGML_LOG_INFO("%s: The GGML backend library was loaded. Unloading it.\n", __func__);
|
||||
GGML_LOG_INFO(GGML_VIRTGPU_BCK "The GGML backend library was loaded. Unloading it.\n");
|
||||
dlclose(backend_library_handle);
|
||||
backend_library_handle = NULL;
|
||||
}
|
||||
@@ -65,7 +59,7 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
|
||||
if (apir_logfile) {
|
||||
ggml_log_set(log_to_file_callback, apir_logfile);
|
||||
} else {
|
||||
GGML_LOG_INFO("Could not open the log file at '%s'\n", apir_log_to_file);
|
||||
GGML_LOG_INFO(GGML_VIRTGPU_BCK "Could not open the log file at '%s'\n", apir_log_to_file);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -74,7 +68,10 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
|
||||
const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG;
|
||||
|
||||
if (!library_name) {
|
||||
GGML_LOG_ERROR("cannot open the GGML library: env var '%s' not defined\n", APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
|
||||
"%s: cannot open the GGML library: env var '%s' not defined\n",
|
||||
__func__, APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV);
|
||||
|
||||
|
||||
return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
|
||||
}
|
||||
@@ -82,13 +79,16 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
|
||||
backend_library_handle = dlopen(library_name, RTLD_LAZY);
|
||||
|
||||
if (!backend_library_handle) {
|
||||
GGML_LOG_ERROR("cannot open the GGML library: %s\n", dlerror());
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
|
||||
"%s: cannot open the GGML library: %s\n", __func__, dlerror());
|
||||
|
||||
return APIR_LOAD_LIBRARY_CANNOT_OPEN;
|
||||
}
|
||||
|
||||
if (!library_reg) {
|
||||
GGML_LOG_ERROR("cannot register the GGML library: env var '%s' not defined\n", APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
|
||||
"%s: cannot register the GGML library: env var '%s' not defined\n",
|
||||
__func__, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV);
|
||||
|
||||
return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
|
||||
}
|
||||
@@ -96,8 +96,10 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
|
||||
void * ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg);
|
||||
dlsym_error = dlerror();
|
||||
if (dlsym_error) {
|
||||
GGML_LOG_ERROR("cannot find the GGML backend registration symbol '%s' (from %s): %s\n", library_reg,
|
||||
APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
|
||||
"%s: cannot find the GGML backend registration symbol '%s' (from %s): %s\n",
|
||||
__func__, library_reg, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error);
|
||||
|
||||
|
||||
return APIR_LOAD_LIBRARY_SYMBOL_MISSING;
|
||||
}
|
||||
@@ -134,7 +136,9 @@ uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id,
|
||||
};
|
||||
|
||||
if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) {
|
||||
GGML_LOG_ERROR("Received an invalid dispatch index (%d >= %d)\n", cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
|
||||
"%s: Received an invalid dispatch index (%d >= %d)\n",
|
||||
__func__, cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT);
|
||||
return APIR_BACKEND_FORWARD_INDEX_INVALID;
|
||||
}
|
||||
|
||||
|
||||
@@ -86,7 +86,7 @@ static inline bool apir_decoder_peek_internal(apir_decoder * dec,
|
||||
assert(val_size <= size);
|
||||
|
||||
if (unlikely(size > (size_t) (dec->end - dec->cur))) {
|
||||
GGML_LOG_ERROR("reading too much from the decoder ...\n");
|
||||
GGML_LOG_ERROR("%s: reading too much from the decoder ...\n", __func__);
|
||||
apir_decoder_set_fatal(dec);
|
||||
memset(val, 0, val_size);
|
||||
return false;
|
||||
@@ -103,7 +103,7 @@ static inline void apir_decoder_peek(apir_decoder * dec, size_t size, void * val
|
||||
|
||||
static inline const void * apir_decoder_use_inplace(apir_decoder * dec, size_t size) {
|
||||
if (unlikely(size > (size_t) (dec->end - dec->cur))) {
|
||||
GGML_LOG_ERROR("reading too much from the decoder ...\n");
|
||||
GGML_LOG_ERROR("%s: reading too much from the decoder ...\n", __func__);
|
||||
apir_decoder_set_fatal(dec);
|
||||
return NULL;
|
||||
}
|
||||
@@ -221,7 +221,7 @@ static inline uint64_t apir_decode_array_size(apir_decoder * dec, uint64_t expec
|
||||
uint64_t size;
|
||||
apir_decode_uint64_t(dec, &size);
|
||||
if (size != expected_size) {
|
||||
GGML_LOG_ERROR("Couldn't decode array from the decoder\n");
|
||||
GGML_LOG_ERROR("%s: Couldn't decode array from the decoder\n", __func__);
|
||||
apir_decoder_set_fatal(dec);
|
||||
size = 0;
|
||||
}
|
||||
@@ -322,7 +322,7 @@ static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t
|
||||
if (size) {
|
||||
val[size - 1] = '\0';
|
||||
} else {
|
||||
GGML_LOG_ERROR("Couldn't decode the blog array\n");
|
||||
GGML_LOG_ERROR("%s: Couldn't decode the blog array\n", __func__);
|
||||
apir_decoder_set_fatal(dec);
|
||||
}
|
||||
}
|
||||
@@ -332,7 +332,8 @@ static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t
|
||||
static inline void * apir_decoder_alloc_array(size_t size, size_t count) {
|
||||
size_t alloc_size;
|
||||
if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) {
|
||||
GGML_LOG_ERROR("overflow in array allocation of %zu * %zu bytes\n", size, count);
|
||||
GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n",
|
||||
__func__, size, count);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
@@ -39,11 +39,17 @@ static inline void apir_encode_ggml_tensor(apir_encoder * enc, const ggml_tensor
|
||||
|
||||
static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) {
|
||||
const apir_rpc_tensor * apir_rpc_tensor = apir_decode_apir_rpc_tensor_inplace(dec);
|
||||
|
||||
if (!apir_rpc_tensor) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_init_params params{
|
||||
/*.mem_size =*/ ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
ggml_context * ctx = ggml_init(params);
|
||||
|
||||
const ggml_tensor * tensor = apir_deserialize_tensor(ctx, apir_rpc_tensor);
|
||||
@@ -71,6 +77,10 @@ static inline ggml_backend_buffer_type_t apir_decode_ggml_buffer_type(apir_decod
|
||||
return (ggml_backend_buffer_type_t) handle;
|
||||
}
|
||||
|
||||
static inline void apir_encode_apir_buffer_type_host_handle(apir_encoder * enc, apir_buffer_type_host_handle_t handle) {
|
||||
apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
|
||||
}
|
||||
|
||||
static inline apir_buffer_type_host_handle_t apir_decode_apir_buffer_type_host_handle(apir_decoder * dec) {
|
||||
apir_buffer_type_host_handle_t handle;
|
||||
|
||||
@@ -154,13 +164,13 @@ static inline void apir_encode_ggml_tensor_inline(apir_encoder * enc, const ggml
|
||||
size_t tensor_size = sizeof(*tensor);
|
||||
|
||||
if (tensor->extra) {
|
||||
GGML_ABORT("Cannot pass tensors with extra");
|
||||
GGML_ABORT("%s: Cannot pass tensors with extra", __func__);
|
||||
}
|
||||
|
||||
if (tensor->src[0] && tensor->buffer) {
|
||||
static int first = 1;
|
||||
if (first) {
|
||||
GGML_LOG_WARN("Cannot pass tensors with src and buffer\n");
|
||||
GGML_LOG_WARN("%s: Cannot pass tensors with src and buffer\n", __func__);
|
||||
first = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml
|
||||
|
||||
ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) malloc(sizeof(*context));
|
||||
if (!context) {
|
||||
GGML_ABORT("Couldn't allocate the buffer context ...");
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the buffer context ...", __func__);
|
||||
}
|
||||
|
||||
context->gpu = gpu;
|
||||
@@ -20,7 +20,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml
|
||||
context->base = context->apir_context.shmem.mmap_ptr;
|
||||
context->is_from_ptr = true;
|
||||
} else {
|
||||
context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size);
|
||||
context->apir_context = apir_buffer_type_alloc_buffer(gpu, gpu->cached_buffer_type.host_handle, size);
|
||||
context->is_from_ptr = false;
|
||||
context->base = NULL;
|
||||
}
|
||||
@@ -34,36 +34,19 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml
|
||||
static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
virtgpu * gpu = BUFT_TO_GPU(buft);
|
||||
|
||||
return apir_buffer_type_get_name(gpu, buft);
|
||||
return gpu->cached_buffer_type.name;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
virtgpu * gpu = BUFT_TO_GPU(buft);
|
||||
|
||||
static size_t align = 0;
|
||||
|
||||
if (align == 0) {
|
||||
align = apir_buffer_type_get_alignment(gpu, buft);
|
||||
}
|
||||
|
||||
return align;
|
||||
return gpu->cached_buffer_type.alignment;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
virtgpu * gpu = BUFT_TO_GPU(buft);
|
||||
|
||||
static size_t max_size = 0;
|
||||
if (max_size == 0) {
|
||||
max_size = apir_buffer_type_get_max_size(gpu, buft);
|
||||
}
|
||||
|
||||
return max_size;
|
||||
}
|
||||
|
||||
static bool ggml_backend_remoting_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||
virtgpu * gpu = BUFT_TO_GPU(buft);
|
||||
|
||||
return apir_buffer_type_is_host(gpu, buft);
|
||||
return gpu->cached_buffer_type.max_size;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
|
||||
@@ -76,7 +59,7 @@ static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buff
|
||||
return ggml_nbytes(tensor);
|
||||
}
|
||||
|
||||
return apir_buffer_type_get_alloc_size(gpu, buft, tensor);
|
||||
return apir_buffer_type_get_alloc_size(gpu, gpu->cached_buffer_type.host_handle, tensor);
|
||||
}
|
||||
|
||||
const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = {
|
||||
|
||||
@@ -3,32 +3,27 @@
|
||||
static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) {
|
||||
virtgpu * gpu = DEV_TO_GPU(dev);
|
||||
|
||||
return apir_device_get_name(gpu);
|
||||
return gpu->cached_device_info.name;
|
||||
}
|
||||
|
||||
static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) {
|
||||
virtgpu * gpu = DEV_TO_GPU(dev);
|
||||
|
||||
return apir_device_get_description(gpu);
|
||||
// Return the pre-cached description from the virtgpu structure
|
||||
return gpu->cached_device_info.description;
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) {
|
||||
virtgpu * gpu = DEV_TO_GPU(dev);
|
||||
|
||||
static enum ggml_backend_dev_type type;
|
||||
static bool has_type = false;
|
||||
if (!has_type) {
|
||||
has_type = true;
|
||||
type = (enum ggml_backend_dev_type) apir_device_get_type(gpu);
|
||||
}
|
||||
|
||||
return type;
|
||||
return (enum ggml_backend_dev_type) gpu->cached_device_info.type;
|
||||
}
|
||||
|
||||
static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
virtgpu * gpu = DEV_TO_GPU(dev);
|
||||
|
||||
return apir_device_get_memory(gpu, free, total);
|
||||
*free = gpu->cached_device_info.memory_free;
|
||||
*total = gpu->cached_device_info.memory_total;
|
||||
}
|
||||
|
||||
static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
@@ -77,13 +72,22 @@ static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, ggml_
|
||||
ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
virtgpu * gpu = DEV_TO_GPU(dev);
|
||||
|
||||
apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu);
|
||||
static std::atomic<bool> initialized = false;
|
||||
static ggml_backend_buffer_type buft;
|
||||
|
||||
static ggml_backend_buffer_type buft{
|
||||
/* .iface = */ ggml_backend_remoting_buffer_type_interface,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ (void *) ctx,
|
||||
};
|
||||
if (!initialized) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
if (!initialized) {
|
||||
buft = {
|
||||
/* .iface = */ ggml_backend_remoting_buffer_type_interface,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ (void *) gpu->cached_buffer_type.host_handle,
|
||||
};
|
||||
initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
return &buft;
|
||||
}
|
||||
@@ -91,13 +95,22 @@ ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_bac
|
||||
static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) {
|
||||
virtgpu * gpu = DEV_TO_GPU(dev);
|
||||
|
||||
apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu);
|
||||
static std::atomic<bool> initialized = false;
|
||||
static ggml_backend_buffer_type buft;
|
||||
|
||||
static ggml_backend_buffer_type buft{
|
||||
/* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ (void *) ctx,
|
||||
};
|
||||
if (!initialized) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
if (!initialized) {
|
||||
buft = {
|
||||
/* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ (void *) gpu->cached_buffer_type.host_handle,
|
||||
};
|
||||
initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
return &buft;
|
||||
}
|
||||
@@ -110,7 +123,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_device_buffer_from_ptr(ggml_b
|
||||
|
||||
ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) malloc(sizeof(*context));
|
||||
if (!context) {
|
||||
GGML_ABORT("Couldn't allocate the buffer context ...");
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the buffer context ...", __func__);
|
||||
}
|
||||
|
||||
context->gpu = gpu;
|
||||
|
||||
@@ -4,37 +4,70 @@
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
|
||||
void ggml_virtgpu_cleanup(virtgpu * gpu);
|
||||
|
||||
static virtgpu * apir_initialize() {
|
||||
static virtgpu * apir_gpu_instance = NULL;
|
||||
static bool apir_initialized = false;
|
||||
static virtgpu * gpu = NULL;
|
||||
static std::atomic<bool> initialized = false;
|
||||
|
||||
if (initialized) {
|
||||
// fast track
|
||||
return gpu;
|
||||
}
|
||||
|
||||
{
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
if (apir_initialized) {
|
||||
return apir_gpu_instance;
|
||||
if (initialized) {
|
||||
// thread safe
|
||||
return gpu;
|
||||
}
|
||||
|
||||
apir_gpu_instance = create_virtgpu();
|
||||
if (!apir_gpu_instance) {
|
||||
GGML_ABORT("failed to initialize the virtgpu");
|
||||
gpu = create_virtgpu();
|
||||
if (!gpu) {
|
||||
initialized = true;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
apir_initialized = true;
|
||||
// Pre-fetch and cache all device information, it will not change
|
||||
gpu->cached_device_info.description = apir_device_get_description(gpu);
|
||||
if (!gpu->cached_device_info.description) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device description", __func__);
|
||||
}
|
||||
gpu->cached_device_info.name = apir_device_get_name(gpu);
|
||||
if (!gpu->cached_device_info.name) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device name", __func__);
|
||||
}
|
||||
gpu->cached_device_info.device_count = apir_device_get_count(gpu);
|
||||
gpu->cached_device_info.type = apir_device_get_type(gpu);
|
||||
|
||||
apir_device_get_memory(gpu,
|
||||
&gpu->cached_device_info.memory_free,
|
||||
&gpu->cached_device_info.memory_total);
|
||||
|
||||
apir_buffer_type_host_handle_t buft_host_handle = apir_device_get_buffer_type(gpu);
|
||||
gpu->cached_buffer_type.host_handle = buft_host_handle;
|
||||
gpu->cached_buffer_type.name = apir_buffer_type_get_name(gpu, buft_host_handle);
|
||||
if (!gpu->cached_buffer_type.name) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu buffer type name", __func__);
|
||||
}
|
||||
gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, buft_host_handle);
|
||||
gpu->cached_buffer_type.max_size = apir_buffer_type_get_max_size(gpu, buft_host_handle);
|
||||
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
return apir_gpu_instance;
|
||||
return gpu;
|
||||
}
|
||||
|
||||
static int ggml_backend_remoting_get_device_count() {
|
||||
virtgpu * gpu = apir_initialize();
|
||||
if (!gpu) {
|
||||
GGML_LOG_WARN("apir_initialize failed\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
return apir_device_get_count(gpu);
|
||||
return gpu->cached_device_info.device_count;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||
@@ -52,17 +85,21 @@ ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device) {
|
||||
|
||||
static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
|
||||
if (devices.size() > 0) {
|
||||
GGML_LOG_INFO("%s: already initialized\n", __func__);
|
||||
GGML_LOG_INFO(GGML_VIRTGPU "%s: already initialized\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
virtgpu * gpu = apir_initialize();
|
||||
if (!gpu) {
|
||||
GGML_LOG_ERROR("apir_initialize failed\n");
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU "%s: apir_initialize failed\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
static bool initialized = false;
|
||||
static std::atomic<bool> initialized = false;
|
||||
|
||||
if (initialized) {
|
||||
return; // fast track
|
||||
}
|
||||
|
||||
{
|
||||
static std::mutex mutex;
|
||||
@@ -70,10 +107,10 @@ static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
|
||||
if (!initialized) {
|
||||
for (int i = 0; i < ggml_backend_remoting_get_device_count(); i++) {
|
||||
ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context;
|
||||
char desc[256] = "API Remoting device";
|
||||
char desc[256] = "ggml-virtgpu API Remoting device";
|
||||
|
||||
ctx->device = i;
|
||||
ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i);
|
||||
ctx->name = GGML_VIRTGPU_NAME + std::to_string(i);
|
||||
ctx->description = desc;
|
||||
ctx->gpu = gpu;
|
||||
|
||||
@@ -98,7 +135,7 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_
|
||||
static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) {
|
||||
UNUSED(reg);
|
||||
|
||||
return GGML_REMOTING_FRONTEND_NAME;
|
||||
return GGML_VIRTGPU_NAME;
|
||||
}
|
||||
|
||||
static const ggml_backend_reg_i ggml_backend_remoting_reg_i = {
|
||||
@@ -111,8 +148,7 @@ static const ggml_backend_reg_i ggml_backend_remoting_reg_i = {
|
||||
ggml_backend_reg_t ggml_backend_virtgpu_reg() {
|
||||
virtgpu * gpu = apir_initialize();
|
||||
if (!gpu) {
|
||||
GGML_LOG_ERROR("virtgpu_apir_initialize failed\n");
|
||||
return NULL;
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU "%s: virtgpu_apir_initialize failed\n", __func__);
|
||||
}
|
||||
|
||||
static ggml_backend_reg reg = {
|
||||
@@ -129,9 +165,25 @@ ggml_backend_reg_t ggml_backend_virtgpu_reg() {
|
||||
|
||||
ggml_backend_remoting_reg_init_devices(®);
|
||||
|
||||
GGML_LOG_INFO("%s: initialized\n", __func__);
|
||||
|
||||
return ®
|
||||
}
|
||||
|
||||
// public function, not exposed in the GGML interface at the moment
|
||||
void ggml_virtgpu_cleanup(virtgpu * gpu) {
|
||||
if (gpu->cached_device_info.name) {
|
||||
free(gpu->cached_device_info.name);
|
||||
gpu->cached_device_info.name = NULL;
|
||||
}
|
||||
if (gpu->cached_device_info.description) {
|
||||
free(gpu->cached_device_info.description);
|
||||
gpu->cached_device_info.description = NULL;
|
||||
}
|
||||
if (gpu->cached_buffer_type.name) {
|
||||
free(gpu->cached_buffer_type.name);
|
||||
gpu->cached_buffer_type.name = NULL;
|
||||
}
|
||||
|
||||
mtx_destroy(&gpu->data_shmem_mutex);
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_virtgpu_reg)
|
||||
|
||||
@@ -8,6 +8,9 @@
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#define GGML_VIRTGPU_NAME "ggml-virtgpu"
|
||||
#define GGML_VIRTGPU "ggml-virtgpu: "
|
||||
|
||||
// USE_ALWAYS_TRUE_SUPPORTS_OP: 1 is fast, 0 avoid micro-benchmark crashes
|
||||
|
||||
#define USE_ALWAYS_TRUE_SUPPORTS_OP 1
|
||||
@@ -62,7 +65,7 @@ static inline apir_buffer_type_host_handle_t ggml_buffer_type_to_apir_handle(ggm
|
||||
|
||||
static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {
|
||||
if (!buffer->context) {
|
||||
GGML_ABORT("%s: no context available :/", __func__);
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: no context available :/", __func__);
|
||||
}
|
||||
return BUFFER_TO_HOST_HANDLE(buffer);
|
||||
}
|
||||
|
||||
@@ -24,10 +24,10 @@ functions:
|
||||
frontend_return: "int"
|
||||
|
||||
get_name:
|
||||
frontend_return: "const char *"
|
||||
frontend_return: "char *"
|
||||
|
||||
get_description:
|
||||
frontend_return: "const char *"
|
||||
frontend_return: "char *"
|
||||
|
||||
get_type:
|
||||
frontend_return: "uint32_t"
|
||||
@@ -64,35 +64,33 @@ functions:
|
||||
group_description: "buffer-type"
|
||||
functions:
|
||||
get_name:
|
||||
frontend_return: "const char *"
|
||||
frontend_return: "char *"
|
||||
frontend_extra_params:
|
||||
- "ggml_backend_buffer_type_t buft"
|
||||
- "apir_buffer_type_host_handle_t host_handle"
|
||||
|
||||
get_alignment:
|
||||
frontend_return: "size_t"
|
||||
frontend_extra_params:
|
||||
- "ggml_backend_buffer_type_t buft"
|
||||
- "apir_buffer_type_host_handle_t host_handle"
|
||||
|
||||
get_max_size:
|
||||
frontend_return: "size_t"
|
||||
frontend_extra_params:
|
||||
- "ggml_backend_buffer_type_t buft"
|
||||
- "apir_buffer_type_host_handle_t host_handle"
|
||||
|
||||
is_host:
|
||||
frontend_return: "bool"
|
||||
frontend_extra_params:
|
||||
- "ggml_backend_buffer_type_t buft"
|
||||
deprecated: true
|
||||
|
||||
alloc_buffer:
|
||||
frontend_return: "apir_buffer_context_t"
|
||||
frontend_extra_params:
|
||||
- "ggml_backend_buffer_type_t buffer_buft"
|
||||
- "apir_buffer_type_host_handle_t host_handle"
|
||||
- "size_t size"
|
||||
|
||||
get_alloc_size:
|
||||
frontend_return: "size_t"
|
||||
frontend_extra_params:
|
||||
- "ggml_backend_buffer_type_t buft"
|
||||
- "apir_buffer_type_host_handle_t host_handle"
|
||||
- "const ggml_tensor *op"
|
||||
|
||||
buffer:
|
||||
|
||||
@@ -116,7 +116,7 @@ class RemotingCodebaseGenerator:
|
||||
'frontend_return': func_metadata.get('frontend_return', 'void'),
|
||||
'frontend_extra_params': func_metadata.get('frontend_extra_params', []),
|
||||
'group_description': group_description,
|
||||
'newly_added': func_metadata.get('newly_added', False)
|
||||
'deprecated': func_metadata.get('deprecated', False),
|
||||
})
|
||||
enum_value += 1
|
||||
|
||||
@@ -165,6 +165,9 @@ class RemotingCodebaseGenerator:
|
||||
|
||||
signature = "uint32_t"
|
||||
params = "apir_encoder *enc, apir_decoder *dec, virgl_apir_context *ctx"
|
||||
if func['deprecated']:
|
||||
decl_lines.append(f"/* {func['enum_name']} is deprecated. Keeping the handler for backward compatibility. */")
|
||||
|
||||
decl_lines.append(f"{signature} {func['backend_function']}({params});")
|
||||
|
||||
# Switch cases
|
||||
@@ -176,7 +179,9 @@ class RemotingCodebaseGenerator:
|
||||
switch_lines.append(f" /* {func['group_description']} */")
|
||||
current_group = func['group_name']
|
||||
|
||||
switch_lines.append(f" case {func['enum_name']}: return \"{func['backend_function']}\";")
|
||||
deprecated = " (DEPRECATED)" if func['deprecated'] else ""
|
||||
|
||||
switch_lines.append(f" case {func['enum_name']}: return \"{func['backend_function']}{deprecated}\";")
|
||||
|
||||
# Dispatch table
|
||||
table_lines = []
|
||||
@@ -188,7 +193,8 @@ class RemotingCodebaseGenerator:
|
||||
table_lines.append("")
|
||||
current_group = func['group_name']
|
||||
|
||||
table_lines.append(f" /* {func['enum_name']} = */ {func['backend_function']},")
|
||||
deprecated = " /* DEPRECATED */" if func['deprecated'] else ""
|
||||
table_lines.append(f" /* {func['enum_name']} = */ {func['backend_function']}{deprecated},")
|
||||
|
||||
header_content = f'''\
|
||||
#pragma once
|
||||
@@ -225,6 +231,10 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC
|
||||
decl_lines.append(f"/* {func['group_description']} */")
|
||||
current_group = func['group_name']
|
||||
|
||||
if func['deprecated']:
|
||||
decl_lines.append(f"/* {func['frontend_function']} is deprecated. */")
|
||||
continue
|
||||
|
||||
# Build parameter list
|
||||
params = [self.naming_patterns['frontend_base_param']]
|
||||
params.extend(func['frontend_extra_params'])
|
||||
@@ -287,7 +297,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC
|
||||
generated_files = [apir_backend_path, backend_dispatched_path, virtgpu_forward_path]
|
||||
|
||||
if not self.clang_format_available:
|
||||
logging.warning("\n⚠️clang-format not found in PATH. Generated files will not be formatted."
|
||||
logging.warning("\n⚠️clang-format not found in PATH. Generated files will not be formatted.\n"
|
||||
" Install clang-format to enable automatic code formatting.")
|
||||
else:
|
||||
logging.info("\n🎨 Formatting files with clang-format...")
|
||||
|
||||
@@ -18,12 +18,17 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
|
||||
|
||||
virtgpu_shmem temp_shmem; // Local storage for large buffers
|
||||
virtgpu_shmem * shmem = &temp_shmem;
|
||||
bool using_shared_shmem = false;
|
||||
|
||||
if (cgraph_size <= gpu->data_shmem.mmap_size) {
|
||||
// prefer the init-time allocated page, if large enough
|
||||
// Lock mutex before using shared data_shmem buffer
|
||||
if (mtx_lock(&gpu->data_shmem_mutex) != thrd_success) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
|
||||
}
|
||||
using_shared_shmem = true;
|
||||
shmem = &gpu->data_shmem;
|
||||
} else if (virtgpu_shmem_create(gpu, cgraph_size, shmem)) {
|
||||
GGML_ABORT("Couldn't allocate the guest-host shared buffer");
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);
|
||||
}
|
||||
|
||||
apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
|
||||
@@ -42,7 +47,10 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
|
||||
|
||||
remote_call_finish(gpu, encoder, decoder);
|
||||
|
||||
if (shmem != &gpu->data_shmem) {
|
||||
// Unlock mutex before cleanup
|
||||
if (using_shared_shmem) {
|
||||
mtx_unlock(&gpu->data_shmem_mutex);
|
||||
} else {
|
||||
virtgpu_shmem_destroy(gpu, shmem);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
#include "virtgpu-forward-impl.h"
|
||||
|
||||
const char * apir_buffer_type_get_name(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
|
||||
char * apir_buffer_type_get_name(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle) {
|
||||
apir_encoder * encoder;
|
||||
apir_decoder * decoder;
|
||||
ApirForwardReturnCode ret;
|
||||
|
||||
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME);
|
||||
|
||||
apir_encode_ggml_buffer_type(encoder, buft);
|
||||
apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
|
||||
|
||||
REMOTE_CALL(gpu, encoder, decoder, ret);
|
||||
|
||||
const size_t string_size = apir_decode_array_size_unchecked(decoder);
|
||||
char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
|
||||
if (!string) {
|
||||
GGML_LOG_ERROR("%s: Could not allocate the device name buffer\n", __func__);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device name buffer\n", __func__);
|
||||
apir_decoder_set_fatal(decoder);
|
||||
}
|
||||
apir_decode_char_array(decoder, string, string_size);
|
||||
@@ -24,14 +24,14 @@ const char * apir_buffer_type_get_name(virtgpu * gpu, ggml_backend_buffer_type_t
|
||||
return string;
|
||||
}
|
||||
|
||||
size_t apir_buffer_type_get_alignment(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
|
||||
size_t apir_buffer_type_get_alignment(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle) {
|
||||
apir_encoder * encoder;
|
||||
apir_decoder * decoder;
|
||||
ApirForwardReturnCode ret;
|
||||
|
||||
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT);
|
||||
|
||||
apir_encode_ggml_buffer_type(encoder, buft);
|
||||
apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
|
||||
|
||||
REMOTE_CALL(gpu, encoder, decoder, ret);
|
||||
|
||||
@@ -43,14 +43,14 @@ size_t apir_buffer_type_get_alignment(virtgpu * gpu, ggml_backend_buffer_type_t
|
||||
return alignment;
|
||||
}
|
||||
|
||||
size_t apir_buffer_type_get_max_size(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
|
||||
size_t apir_buffer_type_get_max_size(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle) {
|
||||
apir_encoder * encoder;
|
||||
apir_decoder * decoder;
|
||||
ApirForwardReturnCode ret;
|
||||
|
||||
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE);
|
||||
|
||||
apir_encode_ggml_buffer_type(encoder, buft);
|
||||
apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
|
||||
|
||||
REMOTE_CALL(gpu, encoder, decoder, ret);
|
||||
|
||||
@@ -62,26 +62,7 @@ size_t apir_buffer_type_get_max_size(virtgpu * gpu, ggml_backend_buffer_type_t b
|
||||
return max_size;
|
||||
}
|
||||
|
||||
bool apir_buffer_type_is_host(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
|
||||
apir_encoder * encoder;
|
||||
apir_decoder * decoder;
|
||||
ApirForwardReturnCode ret;
|
||||
|
||||
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST);
|
||||
|
||||
apir_encode_ggml_buffer_type(encoder, buft);
|
||||
|
||||
REMOTE_CALL(gpu, encoder, decoder, ret);
|
||||
|
||||
bool is_host;
|
||||
apir_decode_bool_t(decoder, &is_host);
|
||||
|
||||
remote_call_finish(gpu, encoder, decoder);
|
||||
|
||||
return is_host;
|
||||
}
|
||||
|
||||
apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_buffer_type_t buft, size_t size) {
|
||||
apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, size_t size) {
|
||||
apir_encoder * encoder;
|
||||
apir_decoder * decoder;
|
||||
ApirForwardReturnCode ret;
|
||||
@@ -90,7 +71,7 @@ apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_
|
||||
|
||||
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER);
|
||||
|
||||
apir_encode_ggml_buffer_type(encoder, buft);
|
||||
apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
|
||||
|
||||
apir_encode_size_t(encoder, &size);
|
||||
|
||||
@@ -103,14 +84,14 @@ apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_
|
||||
return buffer_context;
|
||||
}
|
||||
|
||||
size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op) {
|
||||
size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, const ggml_tensor * op) {
|
||||
apir_encoder * encoder;
|
||||
apir_decoder * decoder;
|
||||
ApirForwardReturnCode ret;
|
||||
|
||||
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE);
|
||||
|
||||
apir_encode_ggml_buffer_type(encoder, buft);
|
||||
apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
|
||||
|
||||
apir_encode_ggml_tensor_inline(encoder, op);
|
||||
|
||||
|
||||
@@ -36,13 +36,18 @@ void apir_buffer_set_tensor(virtgpu * gpu,
|
||||
|
||||
virtgpu_shmem temp_shmem; // Local storage for large buffers
|
||||
virtgpu_shmem * shmem = &temp_shmem;
|
||||
bool using_shared_shmem = false;
|
||||
|
||||
if (size <= gpu->data_shmem.mmap_size) {
|
||||
// prefer the init-time allocated page, if large enough
|
||||
// Lock mutex before using shared data_shmem buffer
|
||||
if (mtx_lock(&gpu->data_shmem_mutex) != thrd_success) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
|
||||
}
|
||||
using_shared_shmem = true;
|
||||
shmem = &gpu->data_shmem;
|
||||
|
||||
} else if (virtgpu_shmem_create(gpu, size, shmem)) {
|
||||
GGML_ABORT("Couldn't allocate the guest-host shared buffer");
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);
|
||||
}
|
||||
|
||||
memcpy(shmem->mmap_ptr, data, size);
|
||||
@@ -55,7 +60,10 @@ void apir_buffer_set_tensor(virtgpu * gpu,
|
||||
|
||||
remote_call_finish(gpu, encoder, decoder);
|
||||
|
||||
if (shmem != &gpu->data_shmem) {
|
||||
// Unlock mutex before cleanup
|
||||
if (using_shared_shmem) {
|
||||
mtx_unlock(&gpu->data_shmem_mutex);
|
||||
} else {
|
||||
virtgpu_shmem_destroy(gpu, shmem);
|
||||
}
|
||||
|
||||
@@ -79,13 +87,18 @@ void apir_buffer_get_tensor(virtgpu * gpu,
|
||||
|
||||
virtgpu_shmem temp_shmem; // Local storage for large buffers
|
||||
virtgpu_shmem * shmem = &temp_shmem;
|
||||
bool using_shared_shmem = false;
|
||||
|
||||
if (size <= gpu->data_shmem.mmap_size) {
|
||||
// prefer the init-time allocated page, if large enough
|
||||
// Lock mutex before using shared data_shmem buffer
|
||||
if (mtx_lock(&gpu->data_shmem_mutex) != thrd_success) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
|
||||
}
|
||||
using_shared_shmem = true;
|
||||
shmem = &gpu->data_shmem;
|
||||
|
||||
} else if (virtgpu_shmem_create(gpu, size, shmem)) {
|
||||
GGML_ABORT("Couldn't allocate the guest-host shared buffer");
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);
|
||||
}
|
||||
|
||||
apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
|
||||
@@ -98,7 +111,10 @@ void apir_buffer_get_tensor(virtgpu * gpu,
|
||||
|
||||
remote_call_finish(gpu, encoder, decoder);
|
||||
|
||||
if (shmem != &gpu->data_shmem) {
|
||||
// Unlock mutex before cleanup
|
||||
if (using_shared_shmem) {
|
||||
mtx_unlock(&gpu->data_shmem_mutex);
|
||||
} else {
|
||||
virtgpu_shmem_destroy(gpu, shmem);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,11 +2,6 @@
|
||||
#include "virtgpu-shm.h"
|
||||
|
||||
int apir_device_get_count(virtgpu * gpu) {
|
||||
static int32_t dev_count = -1;
|
||||
if (dev_count != -1) {
|
||||
return dev_count;
|
||||
}
|
||||
|
||||
apir_encoder * encoder;
|
||||
apir_decoder * decoder;
|
||||
ApirForwardReturnCode ret;
|
||||
@@ -14,6 +9,7 @@ int apir_device_get_count(virtgpu * gpu) {
|
||||
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_COUNT);
|
||||
REMOTE_CALL(gpu, encoder, decoder, ret);
|
||||
|
||||
int32_t dev_count = -1;
|
||||
apir_decode_int32_t(decoder, &dev_count);
|
||||
|
||||
remote_call_finish(gpu, encoder, decoder);
|
||||
@@ -21,11 +17,7 @@ int apir_device_get_count(virtgpu * gpu) {
|
||||
return dev_count;
|
||||
}
|
||||
|
||||
const char * apir_device_get_name(virtgpu * gpu) {
|
||||
static char * string = nullptr;
|
||||
if (string) {
|
||||
return string;
|
||||
}
|
||||
char * apir_device_get_name(virtgpu * gpu) {
|
||||
apir_encoder * encoder;
|
||||
apir_decoder * decoder;
|
||||
ApirForwardReturnCode ret;
|
||||
@@ -34,9 +26,9 @@ const char * apir_device_get_name(virtgpu * gpu) {
|
||||
REMOTE_CALL(gpu, encoder, decoder, ret);
|
||||
|
||||
const size_t string_size = apir_decode_array_size_unchecked(decoder);
|
||||
string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
|
||||
char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
|
||||
if (!string) {
|
||||
GGML_LOG_ERROR("%s: Could not allocate the device name buffer\n", __func__);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device name buffer\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
apir_decode_char_array(decoder, string, string_size);
|
||||
@@ -46,7 +38,7 @@ const char * apir_device_get_name(virtgpu * gpu) {
|
||||
return string;
|
||||
}
|
||||
|
||||
const char * apir_device_get_description(virtgpu * gpu) {
|
||||
char * apir_device_get_description(virtgpu * gpu) {
|
||||
apir_encoder * encoder;
|
||||
apir_decoder * decoder;
|
||||
ApirForwardReturnCode ret;
|
||||
@@ -58,7 +50,7 @@ const char * apir_device_get_description(virtgpu * gpu) {
|
||||
const size_t string_size = apir_decode_array_size_unchecked(decoder);
|
||||
char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
|
||||
if (!string) {
|
||||
GGML_LOG_ERROR("%s: Could not allocate the device description buffer\n", __func__);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device description buffer\n", __func__);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
@@ -181,7 +173,7 @@ apir_buffer_context_t apir_device_buffer_from_ptr(virtgpu * gpu, size_t size, si
|
||||
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR);
|
||||
|
||||
if (virtgpu_shmem_create(gpu, size, &buffer_context.shmem)) {
|
||||
GGML_ABORT("Couldn't allocate the guest-host shared buffer");
|
||||
GGML_ABORT(GGML_VIRTGPU "Couldn't allocate the guest-host shared buffer");
|
||||
}
|
||||
|
||||
apir_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem.res_id);
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
int32_t forward_flag = (int32_t) apir_command_type__; \
|
||||
encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, forward_flag); \
|
||||
if (!encoder_name) { \
|
||||
GGML_ABORT("%s: failed to prepare the remote call encoder", __func__); \
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -19,10 +19,10 @@
|
||||
do { \
|
||||
ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \
|
||||
if (!decoder_name) { \
|
||||
GGML_ABORT("%s: failed to kick the remote call", __func__); \
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to kick the remote call", __func__); \
|
||||
} \
|
||||
if (ret_name < APIR_FORWARD_BASE_INDEX) { \
|
||||
GGML_ABORT("%s: failed to forward the API call: %s: code %d", __func__, \
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to forward the API call: %s: code %d", __func__, \
|
||||
apir_forward_error(ret_name), ret_name); \
|
||||
} \
|
||||
ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
/* device */
|
||||
void apir_device_get_device_count(struct virtgpu * gpu);
|
||||
int apir_device_get_count(struct virtgpu * gpu);
|
||||
const char * apir_device_get_name(struct virtgpu * gpu);
|
||||
const char * apir_device_get_description(struct virtgpu * gpu);
|
||||
char * apir_device_get_name(struct virtgpu * gpu);
|
||||
char * apir_device_get_description(struct virtgpu * gpu);
|
||||
uint32_t apir_device_get_type(struct virtgpu * gpu);
|
||||
void apir_device_get_memory(struct virtgpu * gpu, size_t * free, size_t * total);
|
||||
bool apir_device_supports_op(struct virtgpu * gpu, const ggml_tensor * op);
|
||||
@@ -17,14 +17,15 @@ void apir_device_get_props(struct virtgpu * gpu,
|
||||
apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu * gpu, size_t size, size_t max_tensor_size);
|
||||
|
||||
/* buffer-type */
|
||||
const char * apir_buffer_type_get_name(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
|
||||
size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
|
||||
size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
|
||||
bool apir_buffer_type_is_host(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
|
||||
apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu,
|
||||
ggml_backend_buffer_type_t buffer_buft,
|
||||
size_t size);
|
||||
size_t apir_buffer_type_get_alloc_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op);
|
||||
char * apir_buffer_type_get_name(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
|
||||
size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
|
||||
size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
|
||||
apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu,
|
||||
apir_buffer_type_host_handle_t host_handle,
|
||||
size_t size);
|
||||
size_t apir_buffer_type_get_alloc_size(struct virtgpu * gpu,
|
||||
apir_buffer_type_host_handle_t host_handle,
|
||||
const ggml_tensor * op);
|
||||
|
||||
/* buffer */
|
||||
void * apir_buffer_get_base(struct virtgpu * gpu, apir_buffer_context_t * buffer_context);
|
||||
|
||||
@@ -85,8 +85,7 @@ int virtgpu_shmem_create(virtgpu * gpu, size_t size, virtgpu_shmem * shmem) {
|
||||
void * ptr = virtgpu_ioctl_map(gpu, gem_handle, size);
|
||||
if (!ptr) {
|
||||
virtgpu_ioctl_gem_close(gpu, gem_handle);
|
||||
GGML_LOG_ERROR("virtgpu_ioctl_map FAILED\n");
|
||||
exit(1);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU "%s: virtgpu_ioctl_map failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ static int virtgpu_handshake(virtgpu * gpu) {
|
||||
|
||||
encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_HANDSHAKE, 0);
|
||||
if (!encoder) {
|
||||
GGML_ABORT("%s: failed to prepare the remote call encoder", __func__);
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -52,7 +52,7 @@ static int virtgpu_handshake(virtgpu * gpu) {
|
||||
log_call_duration(call_duration_ns, "API Remoting handshake");
|
||||
|
||||
if (!decoder) {
|
||||
GGML_ABORT(
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: failed to initiate the communication with the virglrenderer library. "
|
||||
"Most likely, the wrong virglrenderer library was loaded in the hypervisor.",
|
||||
__func__);
|
||||
@@ -65,7 +65,8 @@ static int virtgpu_handshake(virtgpu * gpu) {
|
||||
uint32_t host_minor;
|
||||
|
||||
if (ret_magic != APIR_HANDSHAKE_MAGIC) {
|
||||
GGML_ABORT("%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic,
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic,
|
||||
apir_backend_initialize_error(ret_magic));
|
||||
} else {
|
||||
apir_decode_uint32_t(decoder, &host_major);
|
||||
@@ -78,13 +79,13 @@ static int virtgpu_handshake(virtgpu * gpu) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: Guest is running with %u.%u\n", __func__, guest_major, guest_minor);
|
||||
GGML_LOG_INFO("%s: Host is running with %u.%u\n", __func__, host_major, host_minor);
|
||||
GGML_LOG_INFO(GGML_VIRTGPU "%s: Guest is running with %u.%u\n", __func__, guest_major, guest_minor);
|
||||
GGML_LOG_INFO(GGML_VIRTGPU "%s: Host is running with %u.%u\n", __func__, host_major, host_minor);
|
||||
|
||||
if (guest_major != host_major) {
|
||||
GGML_LOG_ERROR("Host major (%d) and guest major (%d) version differ\n", host_major, guest_major);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU "Host major (%d) and guest major (%d) version differ\n", host_major, guest_major);
|
||||
} else if (guest_minor != host_minor) {
|
||||
GGML_LOG_WARN("Host minor (%d) and guest minor (%d) version differ\n", host_minor, guest_minor);
|
||||
GGML_LOG_WARN(GGML_VIRTGPU "Host minor (%d) and guest minor (%d) version differ\n", host_minor, guest_minor);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@@ -97,7 +98,7 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
|
||||
|
||||
encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_LOADLIBRARY, 0);
|
||||
if (!encoder) {
|
||||
GGML_ABORT("%s: hypercall error: failed to prepare the remote call encoder", __func__);
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: hypercall error: failed to prepare the API Remoting command encoder", __func__);
|
||||
return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR;
|
||||
}
|
||||
|
||||
@@ -108,36 +109,67 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
|
||||
log_call_duration(call_duration_ns, "API Remoting LoadLibrary");
|
||||
|
||||
if (!decoder) {
|
||||
GGML_ABORT("%s: hypercall error: failed to kick the API remoting hypercall.\n", __func__);
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: hypercall error: failed to trigger the API Remoting hypercall.\n", __func__);
|
||||
return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR;
|
||||
}
|
||||
|
||||
remote_call_finish(gpu, encoder, decoder);
|
||||
|
||||
if (ret == APIR_LOAD_LIBRARY_SUCCESS) {
|
||||
GGML_LOG_INFO("%s: The API Remoting backend was successfully loaded and initialized\n", __func__);
|
||||
GGML_LOG_INFO(GGML_VIRTGPU "The API Remoting backend was successfully loaded and initialized\n");
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
// something wrong happened, find out what.
|
||||
|
||||
if (ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
|
||||
GGML_ABORT("%s: virglrenderer could not load the API Remoting backend library: %s (code %d)", __func__,
|
||||
apir_load_library_error(ret), ret);
|
||||
if (ret == APIR_LOAD_LIBRARY_ENV_VAR_MISSING) {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: virglrenderer could not open the API Remoting backend library, "
|
||||
"some environment variables are missing. "
|
||||
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
|
||||
__func__, apir_load_library_error(ret));
|
||||
} else if (ret == APIR_LOAD_LIBRARY_CANNOT_OPEN) {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: virglrenderer could not open the API Remoting backend library. "
|
||||
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
|
||||
__func__, apir_load_library_error(ret));
|
||||
} else if (ret == APIR_LOAD_LIBRARY_ENV_VAR_MISSING) {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: could not load the backend library, some symbols are missing. "
|
||||
"Make sure virglrenderer is correctly configured by the hypervisor. (%s) ",
|
||||
__func__, apir_load_library_error(ret));
|
||||
} else {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: virglrenderer could not load the API Remoting backend library. (%s - code %d)", __func__,
|
||||
apir_load_library_error(ret), ret);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: virglrenderer successfully loaded the API Remoting backend library", __func__);
|
||||
GGML_LOG_INFO(GGML_VIRTGPU
|
||||
"%s: virglrenderer successfully loaded the API Remoting backend library.\n", __func__);
|
||||
|
||||
ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX);
|
||||
|
||||
if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
|
||||
GGML_ABORT("%s: the API Remoting backend library couldn't load the backend library: apir code=%d | %s)",
|
||||
if (apir_ret == APIR_LOAD_LIBRARY_CANNOT_OPEN) {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: the API Remoting backend library couldn't load the GGML backend library. "
|
||||
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
|
||||
__func__, apir_load_library_error(apir_ret));
|
||||
} else if (apir_ret == APIR_LOAD_LIBRARY_SYMBOL_MISSING) {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: the API Remoting backend library couldn't load the GGML backend library, some symbols are missing. "
|
||||
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
|
||||
__func__, apir_load_library_error(apir_ret));
|
||||
} else if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: the API Remoting backend library couldn't load the GGML backend library: apir code=%d | %s)",
|
||||
__func__, apir_ret, apir_load_library_error(apir_ret));
|
||||
} else {
|
||||
uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX;
|
||||
GGML_ABORT("%s: the API Remoting backend library initialize its backend library: apir code=%d)", __func__,
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: the API Remoting backend library initialize its backend library: apir code=%d)", __func__,
|
||||
lib_ret);
|
||||
}
|
||||
return ret;
|
||||
@@ -149,38 +181,58 @@ virtgpu * create_virtgpu() {
|
||||
gpu->use_apir_capset = getenv("GGML_REMOTING_USE_APIR_CAPSET") != nullptr;
|
||||
util_sparse_array_init(&gpu->shmem_array, sizeof(virtgpu_shmem), 1024);
|
||||
|
||||
// Initialize mutex to protect shared data_shmem buffer
|
||||
if (mtx_init(&gpu->data_shmem_mutex, mtx_plain) != thrd_success) {
|
||||
delete gpu;
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: failed to initialize data_shmem mutex", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (virtgpu_open(gpu) != APIR_SUCCESS) {
|
||||
GGML_ABORT("%s: failed to open the virtgpu device", __func__);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU
|
||||
"%s: failed to open the virtgpu device\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (virtgpu_init_capset(gpu) != APIR_SUCCESS) {
|
||||
GGML_ABORT("%s: failed to initialize the GPU capset", __func__);
|
||||
if (gpu->use_apir_capset) {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: failed to initialize the virtgpu APIR capset. Make sure that the virglrenderer library supports it.", __func__);
|
||||
} else {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: failed to initialize the virtgpu Venus capset", __func__);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (virtgpu_init_context(gpu) != APIR_SUCCESS) {
|
||||
GGML_ABORT("%s: failed to initialize the GPU context", __func__);
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: failed to initialize the GPU context", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (virtgpu_shmem_create(gpu, SHMEM_REPLY_SIZE, &gpu->reply_shmem)) {
|
||||
GGML_ABORT("%s: failed to create the shared reply memory pages", __func__);
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: failed to create the shared reply memory pages", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (virtgpu_shmem_create(gpu, SHMEM_DATA_SIZE, &gpu->data_shmem)) {
|
||||
GGML_ABORT("%s: failed to create the shared data memory pages", __func__);
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: failed to create the shared data memory pages", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (virtgpu_handshake(gpu)) {
|
||||
GGML_ABORT("%s: failed to handshake with the virglrenderer library", __func__);
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: failed to handshake with the virglrenderer library", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) {
|
||||
GGML_ABORT("%s: failed to load the backend library", __func__);
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: failed to load the backend library", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -191,7 +243,8 @@ static virt_gpu_result_t virtgpu_open(virtgpu * gpu) {
|
||||
drmDevicePtr devs[8];
|
||||
int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs));
|
||||
if (count < 0) {
|
||||
GGML_LOG_ERROR("%s: failed to enumerate DRM devices\n", __func__);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU
|
||||
"%s: failed to enumerate DRM devices\n", __func__);
|
||||
return APIR_ERROR_INITIALIZATION_FAILED;
|
||||
}
|
||||
|
||||
@@ -213,16 +266,19 @@ static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr d
|
||||
|
||||
int fd = open(node_path, O_RDWR | O_CLOEXEC);
|
||||
if (fd < 0) {
|
||||
GGML_ABORT("failed to open %s", node_path);
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: failed to open %s", __func__, node_path);
|
||||
return APIR_ERROR_INITIALIZATION_FAILED;
|
||||
}
|
||||
|
||||
drmVersionPtr version = drmGetVersion(fd);
|
||||
if (!version || strcmp(version->name, "virtio_gpu") || version->version_major != 0) {
|
||||
if (version) {
|
||||
GGML_ABORT("unknown DRM driver %s version %d", version->name, version->version_major);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU
|
||||
"%s: unknown DRM driver %s version %d\n", __func__, version->name, version->version_major);
|
||||
} else {
|
||||
GGML_ABORT("failed to get DRM driver version");
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU
|
||||
"%s: failed to get DRM driver version\n", __func__);
|
||||
}
|
||||
|
||||
if (version) {
|
||||
@@ -236,7 +292,7 @@ static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr d
|
||||
|
||||
drmFreeVersion(version);
|
||||
|
||||
GGML_LOG_INFO("using DRM device %s\n", node_path);
|
||||
GGML_LOG_INFO(GGML_VIRTGPU "using DRM device %s\n", node_path);
|
||||
|
||||
return APIR_SUCCESS;
|
||||
}
|
||||
@@ -245,7 +301,7 @@ static virt_gpu_result_t virtgpu_init_context(virtgpu * gpu) {
|
||||
assert(!gpu->capset.version);
|
||||
const int ret = virtgpu_ioctl_context_init(gpu, gpu->capset.id);
|
||||
if (ret) {
|
||||
GGML_LOG_INFO("failed to initialize context: %s\n", strerror(errno));
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to initialize context: %s\n", __func__, strerror(errno));
|
||||
return APIR_ERROR_INITIALIZATION_FAILED;
|
||||
}
|
||||
|
||||
@@ -254,10 +310,10 @@ static virt_gpu_result_t virtgpu_init_context(virtgpu * gpu) {
|
||||
|
||||
static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) {
|
||||
if (gpu->use_apir_capset) {
|
||||
GGML_LOG_INFO("Using the APIR capset\n");
|
||||
GGML_LOG_INFO(GGML_VIRTGPU "Using the APIR capset\n");
|
||||
gpu->capset.id = VIRTGPU_DRM_CAPSET_APIR;
|
||||
} else {
|
||||
GGML_LOG_INFO("Using the Venus capset\n");
|
||||
GGML_LOG_INFO(GGML_VIRTGPU "Using the Venus capset\n");
|
||||
gpu->capset.id = VIRTGPU_DRM_CAPSET_VENUS;
|
||||
}
|
||||
gpu->capset.version = 0;
|
||||
@@ -266,7 +322,9 @@ static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) {
|
||||
virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, &gpu->capset.data, sizeof(gpu->capset.data));
|
||||
|
||||
if (ret) {
|
||||
GGML_LOG_INFO("failed to get APIR v%d capset: %s\n", gpu->capset.version, strerror(errno));
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU
|
||||
"%s: failed to get APIR v%d capset: %s\n",
|
||||
__func__, gpu->capset.version, strerror(errno));
|
||||
return APIR_ERROR_INITIALIZATION_FAILED;
|
||||
}
|
||||
|
||||
@@ -333,9 +391,9 @@ apir_encoder * remote_call_prepare(virtgpu * gpu, ApirCommandType apir_cmd_type,
|
||||
* Prepare the command encoder and its buffer
|
||||
*/
|
||||
|
||||
static char encoder_buffer[4096];
|
||||
thread_local char encoder_buffer[4096];
|
||||
|
||||
static apir_encoder enc;
|
||||
thread_local apir_encoder enc;
|
||||
enc = {
|
||||
.cur = encoder_buffer,
|
||||
.start = encoder_buffer,
|
||||
@@ -369,19 +427,19 @@ void remote_call_finish(virtgpu * gpu, apir_encoder * enc, apir_decoder * dec) {
|
||||
UNUSED(gpu);
|
||||
|
||||
if (!enc) {
|
||||
GGML_LOG_ERROR("Invalid (null) encoder\n");
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Invalid (null) encoder", __func__);
|
||||
}
|
||||
|
||||
if (!dec) {
|
||||
GGML_LOG_ERROR("Invalid (null) decoder\n");
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Invalid (null) decoder", __func__);
|
||||
}
|
||||
|
||||
if (apir_encoder_get_fatal(enc)) {
|
||||
GGML_LOG_ERROR("Failed to encode the output parameters.\n");
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU "%s: Failed to encode the output parameters.", __func__);
|
||||
}
|
||||
|
||||
if (apir_decoder_get_fatal(dec)) {
|
||||
GGML_LOG_ERROR("Failed to decode the input parameters.\n");
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU "%s: Failed to decode the input parameters.", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -423,7 +481,7 @@ uint32_t remote_call(virtgpu * gpu,
|
||||
int ret = drmIoctl(gpu->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &args);
|
||||
|
||||
if (ret != 0) {
|
||||
GGML_ABORT("%s: the virtgpu EXECBUFFER ioctl failed (%d)", __func__, ret);
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: the virtgpu EXECBUFFER ioctl failed (%d)", __func__, ret);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -467,7 +525,7 @@ uint32_t remote_call(virtgpu * gpu,
|
||||
}
|
||||
|
||||
if (max_wait_ms && timedout) {
|
||||
GGML_LOG_ERROR("timed out waiting for the host answer...\n");
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU "%s: timed out waiting for the host answer...\n", __func__);
|
||||
return APIR_FORWARD_TIMEOUT;
|
||||
}
|
||||
|
||||
@@ -489,10 +547,13 @@ static void log_call_duration(long long call_duration_ns, const char * name) {
|
||||
double call_duration_s = (double) call_duration_ns / 1e9; // 1 second = 1e9 nanoseconds
|
||||
|
||||
if (call_duration_s > 1) {
|
||||
GGML_LOG_INFO("%s: waited %.2fs for the %s host reply...\n", __func__, call_duration_s, name);
|
||||
GGML_LOG_INFO(GGML_VIRTGPU
|
||||
"waited %.2fs for the %s host reply...\n", call_duration_s, name);
|
||||
} else if (call_duration_ms > 1) {
|
||||
GGML_LOG_INFO("%s: waited %.2fms for the %s host reply...\n", __func__, call_duration_ms, name);
|
||||
GGML_LOG_INFO(GGML_VIRTGPU
|
||||
"waited %.2fms for the %s host reply...\n", call_duration_ms, name);
|
||||
} else {
|
||||
GGML_LOG_INFO("%s: waited %lldns for the %s host reply...\n", __func__, call_duration_ns, name);
|
||||
GGML_LOG_INFO(GGML_VIRTGPU
|
||||
"waited %lldns for the %s host reply...\n", call_duration_ns, name);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,6 +17,8 @@
|
||||
|
||||
#include <cstring>
|
||||
|
||||
#include "ggml-remoting.h"
|
||||
|
||||
#define VIRGL_RENDERER_UNSTABLE_APIS 1
|
||||
#include "apir_hw.h"
|
||||
#include <drm/virtgpu_drm.h>
|
||||
@@ -73,6 +75,27 @@ struct virtgpu {
|
||||
/* APIR communication pages */
|
||||
virtgpu_shmem reply_shmem;
|
||||
virtgpu_shmem data_shmem;
|
||||
|
||||
/* Mutex to protect shared data_shmem buffer from concurrent access */
|
||||
mtx_t data_shmem_mutex;
|
||||
|
||||
/* Cached device information to prevent memory leaks and race conditions */
|
||||
struct {
|
||||
char * description;
|
||||
char * name;
|
||||
int32_t device_count;
|
||||
uint32_t type;
|
||||
size_t memory_free;
|
||||
size_t memory_total;
|
||||
} cached_device_info;
|
||||
|
||||
/* Cached buffer type information to prevent memory leaks and race conditions */
|
||||
struct {
|
||||
apir_buffer_type_host_handle_t host_handle;
|
||||
char * name;
|
||||
size_t alignment;
|
||||
size_t max_size;
|
||||
} cached_buffer_type;
|
||||
};
|
||||
|
||||
static inline int virtgpu_ioctl(virtgpu * gpu, unsigned long request, void * args) {
|
||||
|
||||
@@ -254,6 +254,7 @@ enum vk_device_architecture {
|
||||
AMD_RDNA3,
|
||||
INTEL_XE2,
|
||||
NVIDIA_PRE_TURING,
|
||||
NVIDIA_TURING,
|
||||
};
|
||||
|
||||
static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
|
||||
@@ -336,18 +337,34 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
|
||||
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
||||
|
||||
bool cooperative_matrix = false;
|
||||
bool sm_builtins = false;
|
||||
|
||||
// Detect "pre-turing" based on lack of coopmat support.
|
||||
for (const auto& properties : ext_props) {
|
||||
if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) {
|
||||
cooperative_matrix = true;
|
||||
break;
|
||||
} else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) {
|
||||
sm_builtins = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!cooperative_matrix) {
|
||||
return vk_device_architecture::NVIDIA_PRE_TURING;
|
||||
}
|
||||
|
||||
if (sm_builtins) {
|
||||
vk::PhysicalDeviceProperties2 props2;
|
||||
vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
|
||||
|
||||
props2.pNext = &sm_props;
|
||||
|
||||
device.getProperties2(&props2);
|
||||
|
||||
// Turing has 32, following architectures have 48
|
||||
if (sm_props.shaderWarpsPerSM == 32) {
|
||||
return vk_device_architecture::NVIDIA_TURING;
|
||||
}
|
||||
}
|
||||
}
|
||||
return vk_device_architecture::OTHER;
|
||||
}
|
||||
@@ -8460,6 +8477,11 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
FaCodePath path = ctx->device->coopmat2 ? FA_COOPMAT2 :
|
||||
ctx->device->coopmat1_fa_support ? FA_COOPMAT1 : FA_SCALAR;
|
||||
|
||||
if (path == FA_COOPMAT1 && ctx->device->architecture == vk_device_architecture::NVIDIA_TURING) {
|
||||
// Nvidia compiler bug, see https://github.com/ggml-org/llama.cpp/pull/19075#issuecomment-3820716090
|
||||
path = FA_SCALAR;
|
||||
}
|
||||
|
||||
if (path == FA_COOPMAT1) {
|
||||
const bool coopmat_shape_supported = (dst->op_params[3] == GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f32acc) ||
|
||||
(dst->op_params[3] != GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f16acc);
|
||||
|
||||
@@ -7517,8 +7517,11 @@ void ggml_quantize_free(void) {
|
||||
|
||||
iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
|
||||
iq2xs_free_impl(GGML_TYPE_IQ2_XS);
|
||||
iq2xs_free_impl(GGML_TYPE_IQ2_S);
|
||||
iq2xs_free_impl(GGML_TYPE_IQ1_S);
|
||||
iq2xs_free_impl(GGML_TYPE_IQ1_M);
|
||||
iq3xs_free_impl(256);
|
||||
iq3xs_free_impl(512);
|
||||
|
||||
ggml_critical_section_end();
|
||||
}
|
||||
|
||||
@@ -12,8 +12,8 @@ vendor = {
|
||||
# "https://github.com/mackron/miniaudio/raw/refs/tags/0.11.23/miniaudio.h": "vendor/miniaudio/miniaudio.h",
|
||||
"https://github.com/mackron/miniaudio/raw/669ed3e844524fcd883231b13095baee9f6de304/miniaudio.h": "vendor/miniaudio/miniaudio.h",
|
||||
|
||||
"https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.1/httplib.h": "vendor/cpp-httplib/httplib.h",
|
||||
"https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.1/LICENSE": "vendor/cpp-httplib/LICENSE",
|
||||
"https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.2/httplib.h": "vendor/cpp-httplib/httplib.h",
|
||||
"https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.2/LICENSE": "vendor/cpp-httplib/LICENSE",
|
||||
|
||||
"https://raw.githubusercontent.com/sheredom/subprocess.h/b49c56e9fe214488493021017bf3954b91c7c1f5/subprocess.h": "vendor/sheredom/subprocess.h",
|
||||
}
|
||||
|
||||
@@ -1027,11 +1027,7 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
|
||||
llama_sampler_chain_n(sampler) > 0;
|
||||
|
||||
if (sampler && can_offload) {
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(model.dev_output());
|
||||
auto * host_buft = ggml_backend_dev_host_buffer_type(model.dev_output());
|
||||
if (host_buft) {
|
||||
buft = host_buft;
|
||||
}
|
||||
auto * buft = ggml_backend_dev_buffer_type(model.dev_output());
|
||||
|
||||
sampler->iface->backend_init(sampler, buft);
|
||||
|
||||
|
||||
+13
-6
@@ -2419,6 +2419,9 @@ void llm_graph_context::build_sampling() const {
|
||||
return;
|
||||
}
|
||||
|
||||
std::array<ggml_tensor *, 2> outs;
|
||||
outs[0] = res->t_logits;
|
||||
|
||||
auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
|
||||
res->add_input(std::move(inp_sampling));
|
||||
|
||||
@@ -2439,14 +2442,14 @@ void llm_graph_context::build_sampling() const {
|
||||
// add a dummy row of logits
|
||||
// this trick makes the graph static, regardless of which samplers are activated
|
||||
// this is important in order to minimize graph reallocations
|
||||
// TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550)
|
||||
ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
|
||||
|
||||
for (const auto & [seq_id, sampler] : samplers) {
|
||||
const auto it = seq_to_logit_row.find(seq_id);
|
||||
|
||||
// inactive samplers always work on the first row
|
||||
const auto row_idx = seq_to_logit_row.find(seq_id) != seq_to_logit_row.end() ? it->second : 0;
|
||||
const auto row_idx = it != seq_to_logit_row.end() ? it->second : 0;
|
||||
const int i_out = it != seq_to_logit_row.end() ? 1 : 0;
|
||||
|
||||
ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
|
||||
ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
|
||||
@@ -2463,22 +2466,26 @@ void llm_graph_context::build_sampling() const {
|
||||
|
||||
if (data.sampled != nullptr) {
|
||||
res->t_sampled[seq_id] = data.sampled;
|
||||
ggml_build_forward_expand(gf, data.sampled);
|
||||
outs[1] = data.sampled;
|
||||
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
|
||||
}
|
||||
|
||||
if (data.probs != nullptr) {
|
||||
res->t_sampled_probs[seq_id] = data.probs;
|
||||
ggml_build_forward_expand(gf, data.probs);
|
||||
outs[1] = data.probs;
|
||||
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
|
||||
}
|
||||
|
||||
if (data.logits != nullptr) {
|
||||
res->t_sampled_logits[seq_id] = data.logits;
|
||||
ggml_build_forward_expand(gf, data.logits);
|
||||
outs[1] = data.logits;
|
||||
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
|
||||
}
|
||||
|
||||
if (data.candidates != nullptr) {
|
||||
res->t_candidates[seq_id] = data.candidates;
|
||||
ggml_build_forward_expand(gf, data.candidates);
|
||||
outs[1] = data.candidates;
|
||||
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+19
-62
@@ -1025,11 +1025,7 @@ struct llama_sampler_dist : public llama_sampler_backend {
|
||||
|
||||
std::mt19937 rng;
|
||||
|
||||
// backend input
|
||||
struct ggml_tensor * inp_uniform;
|
||||
|
||||
ggml_context_ptr inp_ctx;
|
||||
ggml_backend_buffer_ptr inp_buf;
|
||||
ggml_tensor * inp_uniform;
|
||||
};
|
||||
|
||||
static const char * llama_sampler_dist_name(const struct llama_sampler * smpl) {
|
||||
@@ -1138,37 +1134,10 @@ static bool llama_sampler_dist_backend_init(
|
||||
ggml_backend_buffer_type_t buft) {
|
||||
auto * sctx = (llama_sampler_dist *) smpl->ctx;
|
||||
|
||||
// allocate inputs
|
||||
{
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
sctx->inp_ctx.reset(ggml_init(params));
|
||||
|
||||
// Create the uniform random scalar input tensor. This will be set by
|
||||
// llama_sampler_dist_backend_set_input after this graph is built.
|
||||
sctx->inp_uniform = ggml_new_tensor_1d(sctx->inp_ctx.get(), GGML_TYPE_F32, 1);
|
||||
ggml_set_name (sctx->inp_uniform, "uniform");
|
||||
ggml_set_input(sctx->inp_uniform);
|
||||
|
||||
// Allocate all tensors from our context to the backend
|
||||
sctx->inp_buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(sctx->inp_ctx.get(), buft));
|
||||
|
||||
ggml_backend_buffer_clear(sctx->inp_buf.get(), 0);
|
||||
}
|
||||
|
||||
const bool res = llama_sampler_backend_support(smpl, buft);
|
||||
|
||||
sctx->init(res);
|
||||
|
||||
if (!res) {
|
||||
sctx->inp_ctx.reset(nullptr);
|
||||
sctx->inp_buf.reset(nullptr);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -1178,8 +1147,13 @@ static void llama_sampler_dist_backend_apply(
|
||||
struct ggml_cgraph * gf,
|
||||
struct llama_sampler_data * data) {
|
||||
GGML_UNUSED(gf);
|
||||
|
||||
auto * sctx = (llama_sampler_dist *) smpl->ctx;
|
||||
|
||||
sctx->inp_uniform = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
|
||||
ggml_set_name (sctx->inp_uniform, "uniform");
|
||||
ggml_set_input(sctx->inp_uniform);
|
||||
|
||||
struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits);
|
||||
ggml_set_name(probs, "dist_probs");
|
||||
|
||||
@@ -1226,6 +1200,7 @@ static void llama_sampler_dist_backend_apply(
|
||||
|
||||
static void llama_sampler_dist_backend_set_input(struct llama_sampler * smpl) {
|
||||
auto * sctx = (llama_sampler_dist *) smpl->ctx;
|
||||
|
||||
GGML_ASSERT(sctx->inp_uniform != nullptr);
|
||||
|
||||
// We sample in double precision and cast to float to match rnd numbers of
|
||||
@@ -1262,8 +1237,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
|
||||
/* .seed_cur = */ seed_cur,
|
||||
/* .rng = */ std::mt19937(seed_cur),
|
||||
/* .inp_uniform = */ nullptr,
|
||||
/* .inp_ctx = */ nullptr,
|
||||
/* .inp_buf = */ nullptr,
|
||||
}
|
||||
);
|
||||
}
|
||||
@@ -3461,9 +3434,6 @@ struct llama_sampler_logit_bias : public llama_sampler_backend {
|
||||
|
||||
struct ggml_tensor * inp_logit_bias;
|
||||
struct ggml_tensor * inp_logit_idxs;
|
||||
|
||||
ggml_context_ptr inp_ctx;
|
||||
ggml_backend_buffer_ptr inp_buf;
|
||||
};
|
||||
|
||||
static const char * llama_sampler_logit_bias_name(const struct llama_sampler * smpl) {
|
||||
@@ -3526,6 +3496,16 @@ static void llama_sampler_logit_bias_backend_apply(
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t n = sctx->logit_bias.size();
|
||||
|
||||
sctx->inp_logit_bias = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n);
|
||||
ggml_set_name(sctx->inp_logit_bias, "logit_bias");
|
||||
ggml_set_input(sctx->inp_logit_bias);
|
||||
|
||||
sctx->inp_logit_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n);
|
||||
ggml_set_name(sctx->inp_logit_idxs, "logit_idxs");
|
||||
ggml_set_input(sctx->inp_logit_idxs);
|
||||
|
||||
ggml_tensor * cur = ggml_fill(ctx, data->logits, 0.0f);
|
||||
|
||||
cur = ggml_reshape_2d(ctx, cur, 1, ggml_nelements(cur));
|
||||
@@ -3562,6 +3542,8 @@ static void llama_sampler_logit_bias_backend_set_input(struct llama_sampler * sm
|
||||
static bool llama_sampler_logit_bias_backend_init(
|
||||
struct llama_sampler * smpl,
|
||||
ggml_backend_buffer_type_t buft) {
|
||||
GGML_UNUSED(buft);
|
||||
|
||||
auto * sctx = (llama_sampler_logit_bias *) smpl->ctx;
|
||||
|
||||
sctx->init(true);
|
||||
@@ -3570,29 +3552,6 @@ static bool llama_sampler_logit_bias_backend_init(
|
||||
return true;
|
||||
}
|
||||
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ 2*ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
sctx->inp_ctx.reset(ggml_init(params));
|
||||
|
||||
const size_t n = sctx->logit_bias.size();
|
||||
|
||||
sctx->inp_logit_bias = ggml_new_tensor_2d(sctx->inp_ctx.get(), GGML_TYPE_F32, 1, n);
|
||||
ggml_set_name(sctx->inp_logit_bias, "logit_bias");
|
||||
ggml_set_input(sctx->inp_logit_bias);
|
||||
|
||||
sctx->inp_logit_idxs = ggml_new_tensor_1d(sctx->inp_ctx.get(), GGML_TYPE_I32, n);
|
||||
ggml_set_name(sctx->inp_logit_idxs, "logit_idxs");
|
||||
ggml_set_input(sctx->inp_logit_idxs);
|
||||
|
||||
// Allocate all tensors from our context to the backend
|
||||
sctx->inp_buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(sctx->inp_ctx.get(), buft));
|
||||
|
||||
ggml_backend_buffer_clear(sctx->inp_buf.get(), 0);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -3628,8 +3587,6 @@ struct llama_sampler * llama_sampler_init_logit_bias(
|
||||
/* .to_search = */ {},
|
||||
/* .inp_logit_bias = */ nullptr,
|
||||
/* .inp_logit_idxs = */ nullptr,
|
||||
/* .inp_ctx = */ nullptr,
|
||||
/* .inp_buf = */ nullptr,
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2262,6 +2262,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|| t.first == "<PRE>"
|
||||
|| t.first == "▁<PRE>" // CodeLlama
|
||||
|| t.first == "<|code_prefix|>" // GLM-4.5
|
||||
|| t.first == "<|prefix|>" // Falcon-H1-Tiny-Coder
|
||||
) {
|
||||
special_fim_pre_id = t.second;
|
||||
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
@@ -2282,6 +2283,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|| t.first == "<SUF>"
|
||||
|| t.first == "▁<SUF>" // CodeLlama
|
||||
|| t.first == "<|code_suffix|>" // GLM-4.5
|
||||
|| t.first == "<|suffix|>" // Falcon-H1-Tiny-Coder
|
||||
) {
|
||||
special_fim_suf_id = t.second;
|
||||
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
@@ -2302,6 +2304,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|| t.first == "<MID>"
|
||||
|| t.first == "▁<MID>" // CodeLlama
|
||||
|| t.first == "<|code_middle|>" // GLM-4.5
|
||||
|| t.first == "<|middle|>" // Falcon-H1-Tiny-Coder
|
||||
) {
|
||||
special_fim_mid_id = t.second;
|
||||
if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
|
||||
@@ -43,7 +43,7 @@ llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv));
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = build_norm(Qcur,
|
||||
|
||||
@@ -265,9 +265,15 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
|
||||
cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
|
||||
ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp);
|
||||
ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp,
|
||||
1, chunk_size, n_chunks, g_diff_exp->ne[3]);
|
||||
|
||||
ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t);
|
||||
cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
|
||||
|
||||
ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff));
|
||||
cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs)
|
||||
|
||||
|
||||
// state to be updated per chunk
|
||||
ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
|
||||
@@ -322,9 +328,9 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
|
||||
: ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
|
||||
|
||||
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
|
||||
ggml_tensor * k_gdiff = ggml_cont(ctx0, get_slice_2d(ctx0, key_gdiff, chunk));
|
||||
ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk);
|
||||
//ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
|
||||
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, k_gdiff)));
|
||||
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t);
|
||||
|
||||
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
|
||||
ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
|
||||
|
||||
@@ -8032,6 +8032,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
for (int mode : {GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_IMROPE, GGML_ROPE_TYPE_VISION}) {
|
||||
for (bool ff : {false, true}) {
|
||||
test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, mode, 512, 1.4245f, 0.7465f, 1.4245f, ff, 0, true, true));
|
||||
test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, mode, 512, 1.4245f, 0.7465f, 1.4245f, ff, 1, true, true));
|
||||
test_cases.emplace_back(new test_rope(type, {128, 32, 2, 3}, 128, mode, 512, 1.4245f, 0.7465f, 1.4245f, ff, 1, true, true));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -674,15 +674,12 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
||||
int n_eval = (int) embd.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
|
||||
if (!embd.empty()) {
|
||||
int n_eval = (int) embd.size();
|
||||
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
|
||||
GGML_ASSERT(n_eval <= params.n_batch);
|
||||
if (llama_decode(ctx, llama_batch_get_one(embd.data(), n_eval))) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -743,7 +740,7 @@ int main(int argc, char ** argv) {
|
||||
common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
if ((int) embd.size() == params.n_batch) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Vendored
+190
-48
@@ -117,6 +117,8 @@ time_t parse_http_date(const std::string &date_str) {
|
||||
|
||||
#ifdef _WIN32
|
||||
return _mkgmtime(&tm_buf);
|
||||
#elif defined _AIX
|
||||
return mktime(&tm_buf);
|
||||
#else
|
||||
return timegm(&tm_buf);
|
||||
#endif
|
||||
@@ -1376,7 +1378,7 @@ int getaddrinfo_with_timeout(const char *node, const char *service,
|
||||
|
||||
// Allocate on the heap, so the resolver thread can keep using the data.
|
||||
auto state = std::make_shared<GetAddrInfoState>();
|
||||
state->node = node;
|
||||
if (node) { state->node = node; }
|
||||
state->service = service;
|
||||
state->hints = *hints;
|
||||
|
||||
@@ -2896,10 +2898,20 @@ bool parse_range_header(const std::string &s, Ranges &ranges) try {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto first =
|
||||
static_cast<ssize_t>(lhs.empty() ? -1 : std::stoll(lhs));
|
||||
const auto last =
|
||||
static_cast<ssize_t>(rhs.empty() ? -1 : std::stoll(rhs));
|
||||
ssize_t first = -1;
|
||||
if (!lhs.empty()) {
|
||||
ssize_t v;
|
||||
auto res = detail::from_chars(lhs.data(), lhs.data() + lhs.size(), v);
|
||||
if (res.ec == std::errc{}) { first = v; }
|
||||
}
|
||||
|
||||
ssize_t last = -1;
|
||||
if (!rhs.empty()) {
|
||||
ssize_t v;
|
||||
auto res = detail::from_chars(rhs.data(), rhs.data() + rhs.size(), v);
|
||||
if (res.ec == std::errc{}) { last = v; }
|
||||
}
|
||||
|
||||
if ((first == -1 && last == -1) ||
|
||||
(first != -1 && last != -1 && first > last)) {
|
||||
all_valid_ranges = false;
|
||||
@@ -2974,25 +2986,17 @@ bool parse_accept_header(const std::string &s,
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef CPPHTTPLIB_NO_EXCEPTIONS
|
||||
{
|
||||
std::istringstream iss(quality_str);
|
||||
iss >> accept_entry.quality;
|
||||
|
||||
// Check if conversion was successful and entire string was consumed
|
||||
if (iss.fail() || !iss.eof()) {
|
||||
double v = 0.0;
|
||||
auto res = detail::from_chars(
|
||||
quality_str.data(), quality_str.data() + quality_str.size(), v);
|
||||
if (res.ec == std::errc{}) {
|
||||
accept_entry.quality = v;
|
||||
} else {
|
||||
has_invalid_entry = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
#else
|
||||
try {
|
||||
accept_entry.quality = std::stod(quality_str);
|
||||
} catch (...) {
|
||||
has_invalid_entry = true;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
// Check if quality is in valid range [0.0, 1.0]
|
||||
if (accept_entry.quality < 0.0 || accept_entry.quality > 1.0) {
|
||||
has_invalid_entry = true;
|
||||
@@ -5570,13 +5574,26 @@ bool Server::read_content(Stream &strm, Request &req, Response &res) {
|
||||
strm, req, res,
|
||||
// Regular
|
||||
[&](const char *buf, size_t n) {
|
||||
// Prevent arithmetic overflow when checking sizes.
|
||||
// Avoid computing (req.body.size() + n) directly because
|
||||
// adding two unsigned `size_t` values can wrap around and
|
||||
// produce a small result instead of indicating overflow.
|
||||
// Instead, check using subtraction: ensure `n` does not
|
||||
// exceed the remaining capacity `max_size() - size()`.
|
||||
if (req.body.size() >= req.body.max_size() ||
|
||||
n > req.body.max_size() - req.body.size()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Limit decompressed body size to payload_max_length_ to protect
|
||||
// against "zip bomb" attacks where a small compressed payload
|
||||
// decompresses to a massive size.
|
||||
if (req.body.size() + n > payload_max_length_ ||
|
||||
req.body.size() + n > req.body.max_size()) {
|
||||
if (payload_max_length_ > 0 &&
|
||||
(req.body.size() >= payload_max_length_ ||
|
||||
n > payload_max_length_ - req.body.size())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
req.body.append(buf, n);
|
||||
return true;
|
||||
},
|
||||
@@ -5666,22 +5683,29 @@ bool Server::read_content_core(
|
||||
// oversized request and fail early (causing connection close). For SSL
|
||||
// builds we cannot reliably peek the decrypted application bytes, so keep
|
||||
// the original behaviour.
|
||||
#if !defined(CPPHTTPLIB_OPENSSL_SUPPORT) && !defined(_WIN32)
|
||||
#if !defined(CPPHTTPLIB_OPENSSL_SUPPORT)
|
||||
if (!req.has_header("Content-Length") &&
|
||||
!detail::is_chunked_transfer_encoding(req.headers)) {
|
||||
socket_t s = strm.socket();
|
||||
if (s != INVALID_SOCKET) {
|
||||
// Peek up to payload_max_length_ + 1 bytes. If more than
|
||||
// payload_max_length_ bytes are pending, reject the request.
|
||||
size_t to_peek =
|
||||
(payload_max_length_ > 0)
|
||||
? (std::min)(payload_max_length_ + 1, static_cast<size_t>(4096))
|
||||
: 1;
|
||||
std::vector<char> peekbuf(to_peek);
|
||||
ssize_t n = ::recv(s, peekbuf.data(), to_peek, MSG_PEEK);
|
||||
if (n > 0 && static_cast<size_t>(n) > payload_max_length_) {
|
||||
// Indicate failure so connection will be closed.
|
||||
return false;
|
||||
// Only peek if payload_max_length is set to a finite value
|
||||
if (payload_max_length_ > 0 &&
|
||||
payload_max_length_ < (std::numeric_limits<size_t>::max)()) {
|
||||
socket_t s = strm.socket();
|
||||
if (s != INVALID_SOCKET) {
|
||||
// Peek to check if there is any pending data
|
||||
char peekbuf[1];
|
||||
ssize_t n = ::recv(s, peekbuf, 1, MSG_PEEK);
|
||||
if (n > 0) {
|
||||
// There is data, so read it with payload limit enforcement
|
||||
auto result = detail::read_content_without_length(
|
||||
strm, payload_max_length_, out);
|
||||
if (result == detail::ReadContentResult::PayloadTooLarge) {
|
||||
res.status = StatusCode::PayloadTooLarge_413;
|
||||
return false;
|
||||
} else if (result != detail::ReadContentResult::Success) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
@@ -6656,7 +6680,8 @@ void ClientImpl::close_socket(Socket &socket) {
|
||||
}
|
||||
|
||||
bool ClientImpl::read_response_line(Stream &strm, const Request &req,
|
||||
Response &res) const {
|
||||
Response &res,
|
||||
bool skip_100_continue) const {
|
||||
std::array<char, 2048> buf{};
|
||||
|
||||
detail::stream_line_reader line_reader(strm, buf.data(), buf.size());
|
||||
@@ -6677,8 +6702,8 @@ bool ClientImpl::read_response_line(Stream &strm, const Request &req,
|
||||
res.status = std::stoi(std::string(m[2]));
|
||||
res.reason = std::string(m[3]);
|
||||
|
||||
// Ignore '100 Continue'
|
||||
while (res.status == StatusCode::Continue_100) {
|
||||
// Ignore '100 Continue' (only when not using Expect: 100-continue explicitly)
|
||||
while (skip_100_continue && res.status == StatusCode::Continue_100) {
|
||||
if (!line_reader.getline()) { return false; } // CRLF
|
||||
if (!line_reader.getline()) { return false; } // next response line
|
||||
|
||||
@@ -7463,7 +7488,8 @@ bool ClientImpl::write_content_with_provider(Stream &strm,
|
||||
}
|
||||
|
||||
bool ClientImpl::write_request(Stream &strm, Request &req,
|
||||
bool close_connection, Error &error) {
|
||||
bool close_connection, Error &error,
|
||||
bool skip_body) {
|
||||
// Prepare additional headers
|
||||
if (close_connection) {
|
||||
if (!req.has_header("Connection")) {
|
||||
@@ -7582,7 +7608,59 @@ bool ClientImpl::write_request(Stream &strm, Request &req,
|
||||
}
|
||||
}
|
||||
|
||||
// After sending request line and headers, wait briefly for an early server
|
||||
// response (e.g. 4xx) and avoid sending a potentially large request body
|
||||
// unnecessarily. This workaround is only enabled on Windows because Unix
|
||||
// platforms surface write errors (EPIPE) earlier; on Windows kernel send
|
||||
// buffering can accept large writes even when the peer already responded.
|
||||
// Check the stream first (which covers SSL via `is_readable()`), then
|
||||
// fall back to select on the socket. Only perform the wait for very large
|
||||
// request bodies to avoid interfering with normal small requests and
|
||||
// reduce side-effects. Poll briefly (up to 50ms as default) for an early
|
||||
// response. Skip this check when using Expect: 100-continue, as the protocol
|
||||
// handles early responses properly.
|
||||
#if defined(_WIN32)
|
||||
if (!skip_body &&
|
||||
req.body.size() > CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_THRESHOLD &&
|
||||
req.path.size() > CPPHTTPLIB_REQUEST_URI_MAX_LENGTH) {
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
for (;;) {
|
||||
// Prefer socket-level readiness to avoid SSL_pending() false-positives
|
||||
// from SSL internals. If the underlying socket is readable, assume an
|
||||
// early response may be present.
|
||||
auto sock = strm.socket();
|
||||
if (sock != INVALID_SOCKET && detail::select_read(sock, 0, 0) > 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Fallback to stream-level check for non-socket streams or when the
|
||||
// socket isn't reporting readable. Avoid using `is_readable()` for
|
||||
// SSL, since `SSL_pending()` may report buffered records that do not
|
||||
// indicate a complete application-level response yet.
|
||||
if (!is_ssl() && strm.is_readable()) { return false; }
|
||||
|
||||
auto now = std::chrono::high_resolution_clock::now();
|
||||
auto elapsed =
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(now - start)
|
||||
.count();
|
||||
if (elapsed >= CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_TIMEOUT_MSECOND) {
|
||||
break;
|
||||
}
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Body
|
||||
if (skip_body) { return true; }
|
||||
|
||||
return write_request_body(strm, req, error);
|
||||
}
|
||||
|
||||
bool ClientImpl::write_request_body(Stream &strm, Request &req,
|
||||
Error &error) {
|
||||
if (req.body.empty()) {
|
||||
return write_content_with_provider(strm, req, error);
|
||||
}
|
||||
@@ -7758,8 +7836,20 @@ void ClientImpl::output_error_log(const Error &err,
|
||||
bool ClientImpl::process_request(Stream &strm, Request &req,
|
||||
Response &res, bool close_connection,
|
||||
Error &error) {
|
||||
// Send request
|
||||
if (!write_request(strm, req, close_connection, error)) { return false; }
|
||||
// Auto-add Expect: 100-continue for large bodies
|
||||
if (CPPHTTPLIB_EXPECT_100_THRESHOLD > 0 && !req.has_header("Expect")) {
|
||||
auto body_size = req.body.empty() ? req.content_length_ : req.body.size();
|
||||
if (body_size >= CPPHTTPLIB_EXPECT_100_THRESHOLD) {
|
||||
req.set_header("Expect", "100-continue");
|
||||
}
|
||||
}
|
||||
|
||||
// Check for Expect: 100-continue
|
||||
auto expect_100_continue = req.get_header_value("Expect") == "100-continue";
|
||||
|
||||
// Send request (skip body if using Expect: 100-continue)
|
||||
auto write_request_success =
|
||||
write_request(strm, req, close_connection, error, expect_100_continue);
|
||||
|
||||
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
|
||||
if (is_ssl()) {
|
||||
@@ -7774,14 +7864,48 @@ bool ClientImpl::process_request(Stream &strm, Request &req,
|
||||
}
|
||||
#endif
|
||||
|
||||
// Handle Expect: 100-continue with timeout
|
||||
if (expect_100_continue && CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND > 0) {
|
||||
time_t sec = CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND / 1000;
|
||||
time_t usec = (CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND % 1000) * 1000;
|
||||
auto ret = detail::select_read(strm.socket(), sec, usec);
|
||||
if (ret <= 0) {
|
||||
// Timeout or error: send body anyway (server didn't respond in time)
|
||||
if (!write_request_body(strm, req, error)) { return false; }
|
||||
expect_100_continue = false; // Switch to normal response handling
|
||||
}
|
||||
}
|
||||
|
||||
// Receive response and headers
|
||||
if (!read_response_line(strm, req, res) ||
|
||||
// When using Expect: 100-continue, don't auto-skip `100 Continue` response
|
||||
if (!read_response_line(strm, req, res, !expect_100_continue) ||
|
||||
!detail::read_headers(strm, res.headers)) {
|
||||
error = Error::Read;
|
||||
if (write_request_success) { error = Error::Read; }
|
||||
output_error_log(error, &req);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!write_request_success) { return false; }
|
||||
|
||||
// Handle Expect: 100-continue response
|
||||
if (expect_100_continue) {
|
||||
if (res.status == StatusCode::Continue_100) {
|
||||
// Server accepted, send the body
|
||||
if (!write_request_body(strm, req, error)) { return false; }
|
||||
|
||||
// Read the actual response
|
||||
res.headers.clear();
|
||||
res.body.clear();
|
||||
if (!read_response_line(strm, req, res) ||
|
||||
!detail::read_headers(strm, res.headers)) {
|
||||
error = Error::Read;
|
||||
output_error_log(error, &req);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// If not 100 Continue, server returned an error; proceed with that response
|
||||
}
|
||||
|
||||
// Body
|
||||
if ((res.status != StatusCode::NoContent_204) && req.method != "HEAD" &&
|
||||
req.method != "CONNECT") {
|
||||
@@ -9543,7 +9667,7 @@ bool SSLClient::load_certs() {
|
||||
last_openssl_error_ = ERR_get_error();
|
||||
ret = false;
|
||||
}
|
||||
} else {
|
||||
} else if (!ca_cert_store_) {
|
||||
auto loaded = false;
|
||||
#ifdef _WIN32
|
||||
loaded =
|
||||
@@ -9790,7 +9914,11 @@ bool SSLClient::verify_host_with_common_name(X509 *server_cert) const {
|
||||
|
||||
bool SSLClient::check_host_name(const char *pattern,
|
||||
size_t pattern_len) const {
|
||||
if (host_.size() == pattern_len && host_ == pattern) { return true; }
|
||||
// Exact match (case-insensitive)
|
||||
if (host_.size() == pattern_len &&
|
||||
detail::case_ignore::equal(host_, std::string(pattern, pattern_len))) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Wildcard match
|
||||
// https://bugs.launchpad.net/ubuntu/+source/firefox-3.0/+bug/376484
|
||||
@@ -9805,9 +9933,23 @@ bool SSLClient::check_host_name(const char *pattern,
|
||||
auto itr = pattern_components.begin();
|
||||
for (const auto &h : host_components_) {
|
||||
auto &p = *itr;
|
||||
if (p != h && p != "*") {
|
||||
auto partial_match = (p.size() > 0 && p[p.size() - 1] == '*' &&
|
||||
!p.compare(0, p.size() - 1, h));
|
||||
if (!httplib::detail::case_ignore::equal(p, h) && p != "*") {
|
||||
bool partial_match = false;
|
||||
if (!p.empty() && p[p.size() - 1] == '*') {
|
||||
const auto prefix_length = p.size() - 1;
|
||||
if (prefix_length == 0) {
|
||||
partial_match = true;
|
||||
} else if (h.size() >= prefix_length) {
|
||||
partial_match =
|
||||
std::equal(p.begin(),
|
||||
p.begin() + static_cast<std::string::difference_type>(
|
||||
prefix_length),
|
||||
h.begin(), [](const char ca, const char cb) {
|
||||
return httplib::detail::case_ignore::to_lower(ca) ==
|
||||
httplib::detail::case_ignore::to_lower(cb);
|
||||
});
|
||||
}
|
||||
}
|
||||
if (!partial_match) { return false; }
|
||||
}
|
||||
++itr;
|
||||
|
||||
Vendored
+93
-9
@@ -8,8 +8,8 @@
|
||||
#ifndef CPPHTTPLIB_HTTPLIB_H
|
||||
#define CPPHTTPLIB_HTTPLIB_H
|
||||
|
||||
#define CPPHTTPLIB_VERSION "0.30.1"
|
||||
#define CPPHTTPLIB_VERSION_NUM "0x001E01"
|
||||
#define CPPHTTPLIB_VERSION "0.30.2"
|
||||
#define CPPHTTPLIB_VERSION_NUM "0x001E02"
|
||||
|
||||
/*
|
||||
* Platform compatibility check
|
||||
@@ -98,6 +98,22 @@
|
||||
#define CPPHTTPLIB_CLIENT_MAX_TIMEOUT_MSECOND 0
|
||||
#endif
|
||||
|
||||
#ifndef CPPHTTPLIB_EXPECT_100_THRESHOLD
|
||||
#define CPPHTTPLIB_EXPECT_100_THRESHOLD 1024
|
||||
#endif
|
||||
|
||||
#ifndef CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND
|
||||
#define CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND 1000
|
||||
#endif
|
||||
|
||||
#ifndef CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_THRESHOLD
|
||||
#define CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_THRESHOLD (1024 * 1024)
|
||||
#endif
|
||||
|
||||
#ifndef CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_TIMEOUT_MSECOND
|
||||
#define CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_TIMEOUT_MSECOND 50
|
||||
#endif
|
||||
|
||||
#ifndef CPPHTTPLIB_IDLE_INTERVAL_SECOND
|
||||
#define CPPHTTPLIB_IDLE_INTERVAL_SECOND 0
|
||||
#endif
|
||||
@@ -286,8 +302,10 @@ using socket_t = int;
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cctype>
|
||||
#include <chrono>
|
||||
#include <climits>
|
||||
#include <condition_variable>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <errno.h>
|
||||
#include <exception>
|
||||
@@ -305,6 +323,7 @@ using socket_t = int;
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <sys/stat.h>
|
||||
#include <system_error>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
@@ -494,6 +513,69 @@ private:
|
||||
bool execute_on_destruction;
|
||||
};
|
||||
|
||||
// Simple from_chars implementation for integer and double types (C++17
|
||||
// substitute)
|
||||
template <typename T> struct from_chars_result {
|
||||
const char *ptr;
|
||||
std::errc ec;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline from_chars_result<T> from_chars(const char *first, const char *last,
|
||||
T &value, int base = 10) {
|
||||
value = 0;
|
||||
const char *p = first;
|
||||
bool negative = false;
|
||||
|
||||
if (p != last && *p == '-') {
|
||||
negative = true;
|
||||
++p;
|
||||
}
|
||||
if (p == last) { return {first, std::errc::invalid_argument}; }
|
||||
|
||||
T result = 0;
|
||||
for (; p != last; ++p) {
|
||||
char c = *p;
|
||||
int digit = -1;
|
||||
if ('0' <= c && c <= '9') {
|
||||
digit = c - '0';
|
||||
} else if ('a' <= c && c <= 'z') {
|
||||
digit = c - 'a' + 10;
|
||||
} else if ('A' <= c && c <= 'Z') {
|
||||
digit = c - 'A' + 10;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
if (digit < 0 || digit >= base) { break; }
|
||||
if (result > ((std::numeric_limits<T>::max)() - digit) / base) {
|
||||
return {p, std::errc::result_out_of_range};
|
||||
}
|
||||
result = result * base + digit;
|
||||
}
|
||||
|
||||
if (p == first || (negative && p == first + 1)) {
|
||||
return {first, std::errc::invalid_argument};
|
||||
}
|
||||
|
||||
value = negative ? -result : result;
|
||||
return {p, std::errc{}};
|
||||
}
|
||||
|
||||
// from_chars for double (simple wrapper for strtod)
|
||||
inline from_chars_result<double> from_chars(const char *first, const char *last,
|
||||
double &value) {
|
||||
std::string s(first, last);
|
||||
char *endptr = nullptr;
|
||||
errno = 0;
|
||||
value = std::strtod(s.c_str(), &endptr);
|
||||
if (endptr == s.c_str()) { return {first, std::errc::invalid_argument}; }
|
||||
if (errno == ERANGE) {
|
||||
return {first + (endptr - s.c_str()), std::errc::result_out_of_range};
|
||||
}
|
||||
return {first + (endptr - s.c_str()), std::errc{}};
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
enum SSLVerifierResponse {
|
||||
@@ -1848,10 +1930,11 @@ private:
|
||||
Result send_(Request &&req);
|
||||
|
||||
socket_t create_client_socket(Error &error) const;
|
||||
bool read_response_line(Stream &strm, const Request &req,
|
||||
Response &res) const;
|
||||
bool read_response_line(Stream &strm, const Request &req, Response &res,
|
||||
bool skip_100_continue = true) const;
|
||||
bool write_request(Stream &strm, Request &req, bool close_connection,
|
||||
Error &error);
|
||||
Error &error, bool skip_body = false);
|
||||
bool write_request_body(Stream &strm, Request &req, Error &error);
|
||||
void prepare_default_headers(Request &r, bool for_stream,
|
||||
const std::string &ct);
|
||||
bool redirect(Request &req, Response &res, Error &error);
|
||||
@@ -3243,10 +3326,11 @@ private:
|
||||
msg.id = value;
|
||||
} else if (field == "retry") {
|
||||
// Parse retry interval in milliseconds
|
||||
try {
|
||||
retry_ms = std::stoi(value);
|
||||
} catch (...) {
|
||||
// Invalid retry value, ignore
|
||||
{
|
||||
int v = 0;
|
||||
auto res =
|
||||
detail::from_chars(value.data(), value.data() + value.size(), v);
|
||||
if (res.ec == std::errc{}) { retry_ms = v; }
|
||||
}
|
||||
}
|
||||
// Unknown fields are ignored per SSE spec
|
||||
|
||||
Reference in New Issue
Block a user