Compare commits

..

8 Commits

Author SHA1 Message Date
Georgi Gerganov d838c22bb3 spec : fix the check-rate logic of ngram-simple (#19261)
* spec : fix the check-rate logic of ngram-simple

* cont : refactor + fix checks
2026-02-04 10:39:53 +02:00
Daniel Bevenius 25f40ca65f completion : simplify batch (embd) processing (#19286)
* completion : simplify batch (embd) processing

This commit simplifies the processing of embd by removing the for loop
that currently exists which uses params.n_batch as its increment. This
commit also removes the clamping of n_eval as the size of embd is always
at most the size of params.n_batch.

The motivation is to clarify the code as it is currently a little
confusing when looking at this for loop in isolation and thinking that
it can process multiple batches.

* add an assert to verify n_eval is not greater than n_batch
2026-02-04 05:43:28 +01:00
Kevin Pouget 015deb9048 ggml-virtgpu: make the code thread safe (#19204)
* ggml-virtgpu: regenerate_remoting.py: add the ability to deprecate a function

* ggml-virtgpu: deprecate buffer_type is_host remoting

not necessary

* ggml-virtgpu: stop using static vars as cache

The static init isn't thread safe.

* ggml-virtgpu: protect the use of the shared memory to transfer data

* ggml-virtgpu: make the remote calls thread-safe

* ggml-virtgpu: backend: don't continue if couldn't allocate the tensor memory

* ggml-virtgpu: add a cleanup function for consistency

* ggml-virtgpu: backend: don't crash if buft->iface.get_max_size is missing

* fix style and ordering

* Remove the static variable in apir_device_get_count

* ggml-virtgpu: improve the logging

* fix review minor formatting changes
2026-02-04 10:46:18 +08:00
Aman Gupta 2ceda3f662 ggml-cpu: use LUT for converting e8->f32 scales on x86 (#19288)
* ggml-cpu: use LUT for converting e8->f32 scales on x86

* add dispatch based on macro
2026-02-04 09:43:29 +08:00
Georgi Gerganov 44008ce8f9 metal : add solve_tri (#19302) 2026-02-03 23:43:14 +02:00
Georgi Gerganov 6a9bf2f788 ci : add sanitizer runs for server (#19291) 2026-02-03 22:41:20 +02:00
Georgi Gerganov faa1bc26ee sampling : delegate input allocation to the scheduler (#19266)
* sampling : delegate input allocation to the scheduler

* graph : compute backend samplers only if needed
2026-02-03 22:16:16 +02:00
Ruben Ortlam 32b17abdb0 vulkan: disable coopmat1 fa on Nvidia Turing (#19290) 2026-02-03 17:37:32 +01:00
48 changed files with 739 additions and 389 deletions
+12 -4
View File
@@ -36,7 +36,7 @@ jobs:
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
build_type: [RelWithDebInfo]
include:
- build_type: Release
@@ -45,7 +45,7 @@ jobs:
- build_type: Release
sanitizer: ""
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
fail-fast: false
steps:
- name: Dependencies
@@ -72,7 +72,15 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
cmake -B build \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_SCHED_NO_REALLOC=ON \
-DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup
@@ -88,7 +96,7 @@ jobs:
- name: Tests
id: server_integration_tests
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) && matrix.build_type == 'Release' }}
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
run: |
cd tools/server/tests
export ${{ matrix.extra_args }}
-23
View File
@@ -164,29 +164,6 @@ llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
message(STATUS "Using -fsanitize=thread")
add_compile_options(-fsanitize=thread)
link_libraries (-fsanitize=thread)
endif()
if (LLAMA_SANITIZE_ADDRESS)
message(STATUS "Using -fsanitize=address")
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
link_libraries (-fsanitize=address)
endif()
if (LLAMA_SANITIZE_UNDEFINED)
message(STATUS "Using -fsanitize=undefined")
add_compile_options(-fsanitize=undefined)
link_libraries (-fsanitize=undefined)
endif()
endif()
include("cmake/license.cmake")
license_add_file("llama.cpp" "LICENSE")
+23
View File
@@ -32,4 +32,27 @@ function(llama_add_compile_flags)
set(CXX_FLAGS "" PARENT_SCOPE)
endif()
endif()
if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
message(STATUS "Using -fsanitize=thread")
add_compile_options(-fsanitize=thread)
link_libraries (-fsanitize=thread)
endif()
if (LLAMA_SANITIZE_ADDRESS)
message(STATUS "Using -fsanitize=address")
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
link_libraries (-fsanitize=address)
endif()
if (LLAMA_SANITIZE_UNDEFINED)
message(STATUS "Using -fsanitize=undefined")
add_compile_options(-fsanitize=undefined)
link_libraries (-fsanitize=undefined)
endif()
endif()
endfunction()
+3 -12
View File
@@ -47,21 +47,15 @@ static std::string common_tokens_to_str(const llama_tokens & inp, size_t start,
* @return Vector of draft tokens, empty if no matching pattern is found
*/
llama_tokens common_ngram_simple_draft(
common_ngram_simple_state & state,
const common_ngram_simple_config & config,
const llama_tokens & tokens, llama_token sampled) {
// Simple implementation of self-speculative decoding without a draft model.
//
const size_t cur_len = tokens.size();
// Only check every check_rate tokens to save compute
// i.e., perform check if (cur_len - idx_last_check) >= check_rate
if (state.idx_last_check + state.config.check_rate > cur_len) {
llama_tokens draft_tokens;
return draft_tokens;
}
size_t n_draft_min = state.config.size_ngram; // size of n-gram to lookup in token history
size_t n_draft_max = state.config.size_mgram; // the m-gram following the found n-gram is used for draft
const size_t n_draft_min = config.size_ngram; // size of n-gram to lookup in token history
const size_t n_draft_max = config.size_mgram; // the m-gram following the found n-gram is used for draft
// vector for tokens we want to verify.
// return empty vector if there is no match.
@@ -80,9 +74,6 @@ llama_tokens common_ngram_simple_draft(
}
pattern.push_back(sampled); // add the last token to the pattern
// We do a search in the token history.
state.idx_last_check = cur_len;
size_t match_pos = 0; // we ignore position 0, position 0 == no match
// search backwards, but skip the current match (we are currently there)
for (size_t j = cur_len - n_draft_min - 1; j > 0; --j) {
+1 -15
View File
@@ -27,23 +27,9 @@ struct common_ngram_simple_config {
uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token
};
// current state (and config) of n-gram simple.
struct common_ngram_simple_state {
common_ngram_simple_config config;
size_t idx_last_check = 0; // index of last check in context history (mutable)
common_ngram_simple_state(const common_ngram_simple_config & config)
: config(config) {}
};
// Searches for a n-gram in the history and checks whether a draft sequence should be generated.
// state: the ngram simple state to search in.
// inp: the tokens generated so far.
// sampled: the token that was just sampled.
// draft: vector to store the draft tokens, initially empty.
llama_tokens common_ngram_simple_draft(
common_ngram_simple_state & state,
const common_ngram_simple_config & config,
const llama_tokens & tokens, llama_token sampled);
+14 -6
View File
@@ -463,12 +463,14 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
// state of self-speculation (simple implementation, not ngram-map)
struct common_speculative_state_ngram_simple : public common_speculative_state {
common_ngram_simple_state state;
common_ngram_simple_config config;
uint16_t check_id = 0; // used to control the frequency of generating drafts
common_speculative_state_ngram_simple(
enum common_speculative_type type,
common_ngram_simple_state state)
: common_speculative_state(type), state(state) {}
common_ngram_simple_config config)
: common_speculative_state(type), config(config) {}
void begin(const llama_tokens & prompt) override {
GGML_UNUSED(prompt);
@@ -479,7 +481,13 @@ struct common_speculative_state_ngram_simple : public common_speculative_state {
const llama_tokens & prompt_tgt,
llama_token id_last,
llama_tokens & result) override {
result = common_ngram_simple_draft(state, prompt_tgt, id_last);
++check_id;
if (check_id < config.check_rate) {
return;
}
check_id = 0;
result = common_ngram_simple_draft(config, prompt_tgt, id_last);
GGML_UNUSED(params);
}
@@ -889,14 +897,14 @@ common_speculative * common_speculative_init(
uint16_t mgram_size_value = ngram_map.size_value;
uint16_t check_rate = ngram_map.check_rate;
auto config_simple = common_ngram_simple_config{
auto config_simple = common_ngram_simple_config {
/* .size_ngram = */ ngram_size_key,
/* .size_mgram = */ mgram_size_value,
/* .check_rate = */ check_rate
};
auto state = std::make_unique<common_speculative_state_ngram_simple>(
/* .type = */ config.type,
/* .state = */ common_ngram_simple_state(config_simple)
/* .state = */ config_simple
);
impls.push_back(std::move(state));
break;
-2
View File
@@ -7,8 +7,6 @@
extern "C" {
#endif
#define GGML_REMOTING_FRONTEND_NAME "RemotingFrontend"
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_virtgpu_reg();
#ifdef __cplusplus
+9 -9
View File
@@ -268,9 +268,9 @@ static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const
_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0)));
}
static inline __m256 quad_mx_delta_float(const int8_t x0, const float y0, const int8_t x1, const float y1) {
return _mm256_set_m128(_mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)),
_mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0)));
static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const uint8_t x1, const float y1) {
return _mm256_set_m128(_mm_set1_ps(GGML_CPU_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)),
_mm_set1_ps(GGML_CPU_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0)));
}
#endif
#elif defined(__SSSE3__)
@@ -782,6 +782,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
__m256 accum1 = _mm256_setzero_ps();
__m256 accum2 = _mm256_setzero_ps();
for (; ib + 1 < nb; ib += 2) {
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
@@ -795,10 +796,10 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 0].e)),
_mm256_cvtepi32_ps(p_1), accum1);
accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 1].e)),
_mm256_cvtepi32_ps(p_2), accum2);
const __m256 scale0 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_E8M0_TO_FP32_HALF(x[ib + 0].e));
const __m256 scale1 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_E8M0_TO_FP32_HALF(x[ib + 1].e));
accum1 = _mm256_fmadd_ps(scale0, _mm256_cvtepi32_ps(p_1), accum1);
accum2 = _mm256_fmadd_ps(scale1, _mm256_cvtepi32_ps(p_2), accum2);
}
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
@@ -830,7 +831,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif
for (; ib < nb; ++ib) {
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_E8M0_TO_FP32_HALF(x[ib].e);
int sumi1 = 0;
int sumi2 = 0;
for (int j = 0; j < QK_MXFP4/2; ++j) {
@@ -3817,4 +3818,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
+8
View File
@@ -75,6 +75,9 @@
// precomputed f32 table for f16 (256 KB) (simd-mappings.h)
float ggml_table_f32_f16[1 << 16];
// precomputed f32 table for e8m0 half (1 KB) (simd-mappings.h)
float ggml_table_f32_e8m0_half[1 << 8];
#if defined(__ARM_ARCH)
struct ggml_arm_arch_features_type {
int sve_cnt;
@@ -3681,6 +3684,11 @@ void ggml_cpu_init(void) {
ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f));
}
// initialize E8M0 half table (256 entries)
for (int i = 0; i < (1 << 8); ++i) {
ggml_table_f32_e8m0_half[i] = GGML_E8M0_TO_FP32_HALF(i);
}
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
+11
View File
@@ -116,6 +116,17 @@ extern "C" {
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
extern float ggml_table_f32_f16[1 << 16];
// precomputed f32 table for e8m0 half (1 KB)
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
extern float ggml_table_f32_e8m0_half[1 << 8];
// Use lookup table for E8M0 on x86 (faster than bit manipulation)
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
#define GGML_CPU_E8M0_TO_FP32_HALF(x) ggml_table_f32_e8m0_half[(uint8_t)(x)]
#else
#define GGML_CPU_E8M0_TO_FP32_HALF(x) GGML_E8M0_TO_FP32_HALF(x)
#endif
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
+30
View File
@@ -534,6 +534,36 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv(ggml_metal_
return res;
}
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri(ggml_metal_library_t lib, const ggml_tensor * op) {
char base[256];
char name[256];
const int nsg = 8;
const int n = op->src[1]->ne[1];
const int k = op->src[1]->ne[0];
snprintf(base, 256, "kernel_solve_tri_%s", ggml_type_name(op->src[0]->type));
snprintf(name, 256, "%s_nsg=%d_n=%d_k=%d", base, nsg, n, k);
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
if (!res.pipeline) {
ggml_metal_cv_t cv = ggml_metal_cv_init();
ggml_metal_cv_set_int16(cv, nsg, FC_SOLVE_TRI + 0);
ggml_metal_cv_set_int16(cv, n, FC_SOLVE_TRI + 1);
ggml_metal_cv_set_int16(cv, k, FC_SOLVE_TRI + 2);
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
ggml_metal_cv_free(cv);
}
res.nsg = nsg;
res.smem = GGML_PAD(GGML_PAD(n, 32)*nsg*sizeof(float), 16);
return res;
}
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext(ggml_metal_library_t lib, ggml_type tsrc0, ggml_type tsrc1, int nsg, int nxpsg, int r1ptg) {
char base[256];
char name[256];
+1
View File
@@ -121,6 +121,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched (ggml_metal_library_t lib, const struct ggml_tensor * op, int ssm_conv_bs);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv (ggml_metal_library_t lib, const struct ggml_tensor * op);
+1
View File
@@ -1152,6 +1152,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
return has_simdgroup_reduction;
case GGML_OP_RWKV_WKV6:
case GGML_OP_RWKV_WKV7:
case GGML_OP_SOLVE_TRI:
return true;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
+29 -1
View File
@@ -78,7 +78,8 @@
#define FC_MUL_MM 700
#define FC_ROPE 800
#define FC_SSM_CONV 900
#define FC_COUNT_EQUAL 1000
#define FC_SOLVE_TRI 1000
#define FC_COUNT_EQUAL 1100
// op-specific constants
#define OP_FLASH_ATTN_EXT_NQPSG 8
@@ -733,6 +734,33 @@ typedef struct {
uint64_t nb0;
} ggml_metal_kargs_ssm_scan;
typedef struct {
int32_t ne00;
int32_t ne01;
int32_t ne02;
int32_t ne03;
uint64_t nb00;
uint64_t nb01;
uint64_t nb02;
uint64_t nb03;
int32_t ne10;
int32_t ne11;
int32_t ne12;
int32_t ne13;
uint64_t nb10;
uint64_t nb11;
uint64_t nb12;
uint64_t nb13;
int32_t ne0;
int32_t ne1;
int32_t ne2;
int32_t ne3;
uint64_t nb0;
uint64_t nb1;
uint64_t nb2;
uint64_t nb3;
} ggml_metal_kargs_solve_tri;
typedef struct {
int32_t ne00t;
int32_t ne00;
+61
View File
@@ -341,6 +341,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
{
n_fuse = ggml_metal_op_rwkv(ctx, idx);
} break;
case GGML_OP_SOLVE_TRI:
{
n_fuse = ggml_metal_op_solve_tri(ctx, idx);
} break;
case GGML_OP_MUL_MAT:
{
n_fuse = ggml_metal_op_mul_mat(ctx, idx);
@@ -1557,6 +1561,63 @@ int ggml_metal_op_rwkv(ggml_metal_op_t ctx, int idx) {
return 1;
}
int ggml_metal_op_solve_tri(ggml_metal_op_t ctx, int idx) {
ggml_tensor * op = ctx->node(idx);
ggml_metal_library_t lib = ctx->lib;
ggml_metal_encoder_t enc = ctx->enc;
GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_kargs_solve_tri args = {
/*.ne00 =*/ ne00,
/*.ne01 =*/ ne01,
/*.ne02 =*/ ne02,
/*.ne03 =*/ ne03,
/*.nb00 =*/ nb00,
/*.nb01 =*/ nb01,
/*.nb02 =*/ nb02,
/*.nb03 =*/ nb03,
/*.ne10 =*/ ne10,
/*.ne11 =*/ ne11,
/*.ne12 =*/ ne12,
/*.ne13 =*/ ne13,
/*.nb10 =*/ nb10,
/*.nb11 =*/ nb11,
/*.nb12 =*/ nb12,
/*.nb13 =*/ nb13,
/*.ne0 =*/ ne0,
/*.ne1 =*/ ne1,
/*.ne2 =*/ ne2,
/*.ne3 =*/ ne3,
/*.nb0 =*/ nb0,
/*.nb1 =*/ nb1,
/*.nb2 =*/ nb2,
/*.nb3 =*/ nb3,
};
auto pipeline = ggml_metal_library_get_pipeline_solve_tri(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
const int nsg = pipeline.nsg;
ggml_metal_encoder_set_threadgroup_memory_size(enc, pipeline.smem, 0);
ggml_metal_encoder_dispatch_threadgroups(enc, (ne10 + nsg - 1)/nsg, ne02, ne03, 32, nsg, 1);
return 1;
}
int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
ggml_tensor * op = ctx->node(idx);
+1
View File
@@ -60,6 +60,7 @@ int ggml_metal_op_soft_max (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_rwkv (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_solve_tri (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_cpy (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_pool_1d (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_pool_2d (ggml_metal_op_t ctx, int idx);
+77
View File
@@ -2737,6 +2737,83 @@ kernel void kernel_rwkv_wkv7_f32(
}
}
constant short FC_solve_tri_nsg [[function_constant(FC_SOLVE_TRI + 0)]];
constant short FC_solve_tri_n [[function_constant(FC_SOLVE_TRI + 1)]];
constant short FC_solve_tri_k [[function_constant(FC_SOLVE_TRI + 2)]];
kernel void kernel_solve_tri_f32(
constant ggml_metal_kargs_solve_tri & args,
device const char * src0,
device const char * src1,
device char * dst,
threadgroup char * shmem [[threadgroup(0)]],
ushort3 tgpig[[threadgroup_position_in_grid]],
ushort sgitg[[simdgroup_index_in_threadgroup]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort3 ntg[[threads_per_threadgroup]]) {
constexpr short NW = N_SIMDWIDTH;
const short NSG = FC_solve_tri_nsg;
const short N = FC_solve_tri_n;
const short K = FC_solve_tri_k;
const short NP = PAD2(N, NW);
const int32_t ne02 = args.ne02;
const int32_t ne03 = args.ne03;
const int32_t i03 = tgpig.z;
const int32_t i02 = tgpig.y;
const int32_t i01 = tgpig.x*NSG + sgitg;
threadgroup float * sh0 = (threadgroup float *) shmem;
device const float * src0_ptr = (device const float *)(src0 + i02 * args.nb02 + i03 * args.nb03) + sgitg*N;
device const float * src1_ptr = (device const float *)(src1 + i02 * args.nb12 + i03 * args.nb13) + i01;
device float * dst_ptr = (device float *)(dst + i02 * args.nb2 + i03 * args.nb3) + i01;
for (short rr = 0; rr < N; rr += NSG) {
threadgroup_barrier(mem_flags::mem_threadgroup);
{
threadgroup float * sh0_cur = sh0 + sgitg*NP;
for (short t = 0; t*NW < N; ++t) {
const short idx = t*NW + tiisg;
sh0_cur[idx] = src0_ptr[idx];
}
src0_ptr += NSG*N;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (i01 >= args.ne10) {
continue;
}
for (short ir = 0; ir < NSG && rr + ir < N; ++ir) {
const short r = rr + ir;
threadgroup float * sh0_cur = sh0 + ir*NP;
float sum = 0.0f;
for (short t = 0; t*NW < r; ++t) {
const short idx = t*NW + tiisg;
sum += sh0_cur[idx] * dst_ptr[idx*K] * (idx < r);
}
sum = simd_sum(sum);
if (tiisg == 0) {
const float diag = sh0_cur[r];
dst_ptr[r*K] = (src1_ptr[r*K] - sum) / diag;
}
}
}
}
kernel void kernel_argmax_f32(
constant ggml_metal_kargs_argmax & args,
device const char * src0,
@@ -36,7 +36,7 @@ apir_rpc_tensor apir_serialize_tensor(const ggml_tensor * tensor) {
result.data = reinterpret_cast<uint64_t>(tensor->data);
if (tensor->data) {
if (!tensor->buffer) {
GGML_ABORT("tensor has data but not buffer");
GGML_ABORT("%s: tensor has data but not buffer", __func__);
}
// tensor->data is serialized as an offset to the buffer base address
result.data -= reinterpret_cast<uint64_t>(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base);
@@ -27,7 +27,7 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v
const void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
if (!shmem_data) {
GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__);
apir_decoder_set_fatal(dec);
return 1;
}
@@ -45,7 +45,7 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v
if (dev->iface.supports_op(dev, op)) {
continue;
}
GGML_LOG_ERROR("Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op));
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op));
status = GGML_STATUS_ABORTED;
apir_encode_ggml_status(enc, &status);
@@ -36,18 +36,22 @@ uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec
ggml_backend_buffer_type_t buft;
buft = apir_decode_ggml_buffer_type(dec);
size_t value = buft->iface.get_max_size(buft);
size_t value = SIZE_MAX;
if (buft->iface.get_max_size) {
value = buft->iface.get_max_size(buft);
}
apir_encode_size_t(enc, &value);
return 0;
}
/* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST is deprecated. Keeping the handler for backward compatibility. */
uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
ggml_backend_buffer_type_t buft;
buft = apir_decode_ggml_buffer_type(dec);
GGML_UNUSED(dec);
const bool is_host = false;
bool is_host = buft->iface.is_host(buft);
apir_encode_bool_t(enc, &is_host);
return 0;
@@ -40,7 +40,7 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl
void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
if (!shmem_data) {
GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__);
return 1;
}
@@ -71,7 +71,7 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl
void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
if (!shmem_data) {
GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__);
return 1;
}
@@ -121,7 +121,7 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg
buffer = apir_decode_ggml_buffer(dec);
if (!apir_untrack_backend_buffer(buffer)) {
GGML_LOG_WARN("%s: unknown buffer %p\n", __func__, (void *) buffer);
GGML_LOG_WARN(GGML_VIRTGPU_BCK "%s: unknown buffer %p\n", __func__, (void *) buffer);
return 1;
}
@@ -124,7 +124,7 @@ uint32_t backend_device_buffer_from_ptr(apir_encoder * enc, apir_decoder * dec,
void * shmem_ptr = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
if (!shmem_ptr) {
GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n");
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__);
apir_decoder_set_fatal(dec);
return 1;
}
@@ -17,26 +17,26 @@ uint64_t timer_count = 0;
uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p) {
if (reg != NULL) {
GGML_LOG_WARN("%s: already initialized\n", __func__);
GGML_LOG_WARN(GGML_VIRTGPU_BCK "%s: already initialized\n", __func__);
return APIR_BACKEND_INITIALIZE_ALREADY_INITED;
}
ggml_backend_reg_t (*ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p;
reg = ggml_backend_reg_fct();
if (reg == NULL) {
GGML_LOG_ERROR("%s: backend registration failed\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend registration failed\n", __func__);
return APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED;
}
if (!reg->iface.get_device_count(reg)) {
GGML_LOG_ERROR("%s: backend initialization failed: no device found\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device found\n", __func__);
return APIR_BACKEND_INITIALIZE_NO_DEVICE;
}
dev = reg->iface.get_device(reg, 0);
if (!dev) {
GGML_LOG_ERROR("%s: backend initialization failed: no device received\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device received\n", __func__);
return APIR_BACKEND_INITIALIZE_NO_DEVICE;
}
@@ -16,6 +16,7 @@ uint32_t backend_device_buffer_from_ptr(apir_encoder * enc, apir_decoder * dec,
uint32_t backend_buffer_type_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_type_get_alignment(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
/* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST is deprecated. Keeping the handler for backward compatibility. */
uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_type_alloc_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
@@ -62,7 +63,7 @@ static inline const char * backend_dispatch_command_name(ApirBackendCommandType
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE:
return "backend_buffer_type_get_max_size";
case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST:
return "backend_buffer_type_is_host";
return "backend_buffer_type_is_host (DEPRECATED)";
case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER:
return "backend_buffer_type_alloc_buffer";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE:
@@ -110,7 +111,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC
/* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = */ backend_buffer_type_get_name,
/* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = */ backend_buffer_type_get_alignment,
/* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = */ backend_buffer_type_get_max_size,
/* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = */ backend_buffer_type_is_host,
/* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = */ backend_buffer_type_is_host /* DEPRECATED */,
/* APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = */ backend_buffer_type_alloc_buffer,
/* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE = */ backend_buffer_type_get_alloc_size,
@@ -11,6 +11,8 @@
#include "shared/apir_cs.h"
#include "shared/apir_cs_ggml.h"
#define GGML_VIRTGPU_BCK "ggml-virtgpu-backend: "
struct virgl_apir_context {
uint32_t ctx_id;
virgl_apir_callbacks * iface;
+18 -14
View File
@@ -35,14 +35,8 @@ void apir_backend_deinit(uint32_t virgl_ctx_id) {
buffer->iface.free_buffer(buffer);
}
if (dev) {
size_t free, total;
dev->iface.get_memory(dev, &free, &total);
GGML_LOG_INFO("%s: free memory: %ld MB\n", __func__, (size_t) free / 1024 / 1024);
}
if (backend_library_handle) {
GGML_LOG_INFO("%s: The GGML backend library was loaded. Unloading it.\n", __func__);
GGML_LOG_INFO(GGML_VIRTGPU_BCK "The GGML backend library was loaded. Unloading it.\n");
dlclose(backend_library_handle);
backend_library_handle = NULL;
}
@@ -65,7 +59,7 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
if (apir_logfile) {
ggml_log_set(log_to_file_callback, apir_logfile);
} else {
GGML_LOG_INFO("Could not open the log file at '%s'\n", apir_log_to_file);
GGML_LOG_INFO(GGML_VIRTGPU_BCK "Could not open the log file at '%s'\n", apir_log_to_file);
}
}
@@ -74,7 +68,10 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG;
if (!library_name) {
GGML_LOG_ERROR("cannot open the GGML library: env var '%s' not defined\n", APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: cannot open the GGML library: env var '%s' not defined\n",
__func__, APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV);
return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
}
@@ -82,13 +79,16 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
backend_library_handle = dlopen(library_name, RTLD_LAZY);
if (!backend_library_handle) {
GGML_LOG_ERROR("cannot open the GGML library: %s\n", dlerror());
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: cannot open the GGML library: %s\n", __func__, dlerror());
return APIR_LOAD_LIBRARY_CANNOT_OPEN;
}
if (!library_reg) {
GGML_LOG_ERROR("cannot register the GGML library: env var '%s' not defined\n", APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: cannot register the GGML library: env var '%s' not defined\n",
__func__, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV);
return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
}
@@ -96,8 +96,10 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
void * ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg);
dlsym_error = dlerror();
if (dlsym_error) {
GGML_LOG_ERROR("cannot find the GGML backend registration symbol '%s' (from %s): %s\n", library_reg,
APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: cannot find the GGML backend registration symbol '%s' (from %s): %s\n",
__func__, library_reg, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error);
return APIR_LOAD_LIBRARY_SYMBOL_MISSING;
}
@@ -134,7 +136,9 @@ uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id,
};
if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) {
GGML_LOG_ERROR("Received an invalid dispatch index (%d >= %d)\n", cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: Received an invalid dispatch index (%d >= %d)\n",
__func__, cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT);
return APIR_BACKEND_FORWARD_INDEX_INVALID;
}
@@ -86,7 +86,7 @@ static inline bool apir_decoder_peek_internal(apir_decoder * dec,
assert(val_size <= size);
if (unlikely(size > (size_t) (dec->end - dec->cur))) {
GGML_LOG_ERROR("reading too much from the decoder ...\n");
GGML_LOG_ERROR("%s: reading too much from the decoder ...\n", __func__);
apir_decoder_set_fatal(dec);
memset(val, 0, val_size);
return false;
@@ -103,7 +103,7 @@ static inline void apir_decoder_peek(apir_decoder * dec, size_t size, void * val
static inline const void * apir_decoder_use_inplace(apir_decoder * dec, size_t size) {
if (unlikely(size > (size_t) (dec->end - dec->cur))) {
GGML_LOG_ERROR("reading too much from the decoder ...\n");
GGML_LOG_ERROR("%s: reading too much from the decoder ...\n", __func__);
apir_decoder_set_fatal(dec);
return NULL;
}
@@ -221,7 +221,7 @@ static inline uint64_t apir_decode_array_size(apir_decoder * dec, uint64_t expec
uint64_t size;
apir_decode_uint64_t(dec, &size);
if (size != expected_size) {
GGML_LOG_ERROR("Couldn't decode array from the decoder\n");
GGML_LOG_ERROR("%s: Couldn't decode array from the decoder\n", __func__);
apir_decoder_set_fatal(dec);
size = 0;
}
@@ -322,7 +322,7 @@ static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t
if (size) {
val[size - 1] = '\0';
} else {
GGML_LOG_ERROR("Couldn't decode the blog array\n");
GGML_LOG_ERROR("%s: Couldn't decode the blog array\n", __func__);
apir_decoder_set_fatal(dec);
}
}
@@ -332,7 +332,8 @@ static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t
static inline void * apir_decoder_alloc_array(size_t size, size_t count) {
size_t alloc_size;
if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) {
GGML_LOG_ERROR("overflow in array allocation of %zu * %zu bytes\n", size, count);
GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n",
__func__, size, count);
return NULL;
}
@@ -39,11 +39,17 @@ static inline void apir_encode_ggml_tensor(apir_encoder * enc, const ggml_tensor
static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) {
const apir_rpc_tensor * apir_rpc_tensor = apir_decode_apir_rpc_tensor_inplace(dec);
if (!apir_rpc_tensor) {
return NULL;
}
ggml_init_params params{
/*.mem_size =*/ ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context * ctx = ggml_init(params);
const ggml_tensor * tensor = apir_deserialize_tensor(ctx, apir_rpc_tensor);
@@ -71,6 +77,10 @@ static inline ggml_backend_buffer_type_t apir_decode_ggml_buffer_type(apir_decod
return (ggml_backend_buffer_type_t) handle;
}
static inline void apir_encode_apir_buffer_type_host_handle(apir_encoder * enc, apir_buffer_type_host_handle_t handle) {
apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
}
static inline apir_buffer_type_host_handle_t apir_decode_apir_buffer_type_host_handle(apir_decoder * dec) {
apir_buffer_type_host_handle_t handle;
@@ -154,13 +164,13 @@ static inline void apir_encode_ggml_tensor_inline(apir_encoder * enc, const ggml
size_t tensor_size = sizeof(*tensor);
if (tensor->extra) {
GGML_ABORT("Cannot pass tensors with extra");
GGML_ABORT("%s: Cannot pass tensors with extra", __func__);
}
if (tensor->src[0] && tensor->buffer) {
static int first = 1;
if (first) {
GGML_LOG_WARN("Cannot pass tensors with src and buffer\n");
GGML_LOG_WARN("%s: Cannot pass tensors with src and buffer\n", __func__);
first = 0;
}
}
@@ -6,7 +6,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml
ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) malloc(sizeof(*context));
if (!context) {
GGML_ABORT("Couldn't allocate the buffer context ...");
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the buffer context ...", __func__);
}
context->gpu = gpu;
@@ -20,7 +20,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml
context->base = context->apir_context.shmem.mmap_ptr;
context->is_from_ptr = true;
} else {
context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size);
context->apir_context = apir_buffer_type_alloc_buffer(gpu, gpu->cached_buffer_type.host_handle, size);
context->is_from_ptr = false;
context->base = NULL;
}
@@ -34,36 +34,19 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml
static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
virtgpu * gpu = BUFT_TO_GPU(buft);
return apir_buffer_type_get_name(gpu, buft);
return gpu->cached_buffer_type.name;
}
static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
virtgpu * gpu = BUFT_TO_GPU(buft);
static size_t align = 0;
if (align == 0) {
align = apir_buffer_type_get_alignment(gpu, buft);
}
return align;
return gpu->cached_buffer_type.alignment;
}
static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
virtgpu * gpu = BUFT_TO_GPU(buft);
static size_t max_size = 0;
if (max_size == 0) {
max_size = apir_buffer_type_get_max_size(gpu, buft);
}
return max_size;
}
static bool ggml_backend_remoting_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
virtgpu * gpu = BUFT_TO_GPU(buft);
return apir_buffer_type_is_host(gpu, buft);
return gpu->cached_buffer_type.max_size;
}
static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
@@ -76,7 +59,7 @@ static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buff
return ggml_nbytes(tensor);
}
return apir_buffer_type_get_alloc_size(gpu, buft, tensor);
return apir_buffer_type_get_alloc_size(gpu, gpu->cached_buffer_type.host_handle, tensor);
}
const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = {
+37 -24
View File
@@ -3,32 +3,27 @@
static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
return apir_device_get_name(gpu);
return gpu->cached_device_info.name;
}
static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
return apir_device_get_description(gpu);
// Return the pre-cached description from the virtgpu structure
return gpu->cached_device_info.description;
}
static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
static enum ggml_backend_dev_type type;
static bool has_type = false;
if (!has_type) {
has_type = true;
type = (enum ggml_backend_dev_type) apir_device_get_type(gpu);
}
return type;
return (enum ggml_backend_dev_type) gpu->cached_device_info.type;
}
static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
virtgpu * gpu = DEV_TO_GPU(dev);
return apir_device_get_memory(gpu, free, total);
*free = gpu->cached_device_info.memory_free;
*total = gpu->cached_device_info.memory_total;
}
static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
@@ -77,13 +72,22 @@ static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, ggml_
ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu);
static std::atomic<bool> initialized = false;
static ggml_backend_buffer_type buft;
static ggml_backend_buffer_type buft{
/* .iface = */ ggml_backend_remoting_buffer_type_interface,
/* .device = */ dev,
/* .context = */ (void *) ctx,
};
if (!initialized) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
buft = {
/* .iface = */ ggml_backend_remoting_buffer_type_interface,
/* .device = */ dev,
/* .context = */ (void *) gpu->cached_buffer_type.host_handle,
};
initialized = true;
}
}
return &buft;
}
@@ -91,13 +95,22 @@ ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_bac
static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu);
static std::atomic<bool> initialized = false;
static ggml_backend_buffer_type buft;
static ggml_backend_buffer_type buft{
/* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface,
/* .device = */ dev,
/* .context = */ (void *) ctx,
};
if (!initialized) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
buft = {
/* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface,
/* .device = */ dev,
/* .context = */ (void *) gpu->cached_buffer_type.host_handle,
};
initialized = true;
}
}
return &buft;
}
@@ -110,7 +123,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_device_buffer_from_ptr(ggml_b
ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) malloc(sizeof(*context));
if (!context) {
GGML_ABORT("Couldn't allocate the buffer context ...");
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the buffer context ...", __func__);
}
context->gpu = gpu;
+73 -21
View File
@@ -4,37 +4,70 @@
#include <iostream>
#include <mutex>
void ggml_virtgpu_cleanup(virtgpu * gpu);
static virtgpu * apir_initialize() {
static virtgpu * apir_gpu_instance = NULL;
static bool apir_initialized = false;
static virtgpu * gpu = NULL;
static std::atomic<bool> initialized = false;
if (initialized) {
// fast track
return gpu;
}
{
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (apir_initialized) {
return apir_gpu_instance;
if (initialized) {
// thread safe
return gpu;
}
apir_gpu_instance = create_virtgpu();
if (!apir_gpu_instance) {
GGML_ABORT("failed to initialize the virtgpu");
gpu = create_virtgpu();
if (!gpu) {
initialized = true;
return NULL;
}
apir_initialized = true;
// Pre-fetch and cache all device information, it will not change
gpu->cached_device_info.description = apir_device_get_description(gpu);
if (!gpu->cached_device_info.description) {
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device description", __func__);
}
gpu->cached_device_info.name = apir_device_get_name(gpu);
if (!gpu->cached_device_info.name) {
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device name", __func__);
}
gpu->cached_device_info.device_count = apir_device_get_count(gpu);
gpu->cached_device_info.type = apir_device_get_type(gpu);
apir_device_get_memory(gpu,
&gpu->cached_device_info.memory_free,
&gpu->cached_device_info.memory_total);
apir_buffer_type_host_handle_t buft_host_handle = apir_device_get_buffer_type(gpu);
gpu->cached_buffer_type.host_handle = buft_host_handle;
gpu->cached_buffer_type.name = apir_buffer_type_get_name(gpu, buft_host_handle);
if (!gpu->cached_buffer_type.name) {
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu buffer type name", __func__);
}
gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, buft_host_handle);
gpu->cached_buffer_type.max_size = apir_buffer_type_get_max_size(gpu, buft_host_handle);
initialized = true;
}
return apir_gpu_instance;
return gpu;
}
static int ggml_backend_remoting_get_device_count() {
virtgpu * gpu = apir_initialize();
if (!gpu) {
GGML_LOG_WARN("apir_initialize failed\n");
return 0;
}
return apir_device_get_count(gpu);
return gpu->cached_device_info.device_count;
}
static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) {
@@ -52,17 +85,21 @@ ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device) {
static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
if (devices.size() > 0) {
GGML_LOG_INFO("%s: already initialized\n", __func__);
GGML_LOG_INFO(GGML_VIRTGPU "%s: already initialized\n", __func__);
return;
}
virtgpu * gpu = apir_initialize();
if (!gpu) {
GGML_LOG_ERROR("apir_initialize failed\n");
GGML_LOG_ERROR(GGML_VIRTGPU "%s: apir_initialize failed\n", __func__);
return;
}
static bool initialized = false;
static std::atomic<bool> initialized = false;
if (initialized) {
return; // fast track
}
{
static std::mutex mutex;
@@ -70,10 +107,10 @@ static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
if (!initialized) {
for (int i = 0; i < ggml_backend_remoting_get_device_count(); i++) {
ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context;
char desc[256] = "API Remoting device";
char desc[256] = "ggml-virtgpu API Remoting device";
ctx->device = i;
ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i);
ctx->name = GGML_VIRTGPU_NAME + std::to_string(i);
ctx->description = desc;
ctx->gpu = gpu;
@@ -98,7 +135,7 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_
static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) {
UNUSED(reg);
return GGML_REMOTING_FRONTEND_NAME;
return GGML_VIRTGPU_NAME;
}
static const ggml_backend_reg_i ggml_backend_remoting_reg_i = {
@@ -111,8 +148,7 @@ static const ggml_backend_reg_i ggml_backend_remoting_reg_i = {
ggml_backend_reg_t ggml_backend_virtgpu_reg() {
virtgpu * gpu = apir_initialize();
if (!gpu) {
GGML_LOG_ERROR("virtgpu_apir_initialize failed\n");
return NULL;
GGML_LOG_ERROR(GGML_VIRTGPU "%s: virtgpu_apir_initialize failed\n", __func__);
}
static ggml_backend_reg reg = {
@@ -129,9 +165,25 @@ ggml_backend_reg_t ggml_backend_virtgpu_reg() {
ggml_backend_remoting_reg_init_devices(&reg);
GGML_LOG_INFO("%s: initialized\n", __func__);
return &reg;
}
// public function, not exposed in the GGML interface at the moment
void ggml_virtgpu_cleanup(virtgpu * gpu) {
if (gpu->cached_device_info.name) {
free(gpu->cached_device_info.name);
gpu->cached_device_info.name = NULL;
}
if (gpu->cached_device_info.description) {
free(gpu->cached_device_info.description);
gpu->cached_device_info.description = NULL;
}
if (gpu->cached_buffer_type.name) {
free(gpu->cached_buffer_type.name);
gpu->cached_buffer_type.name = NULL;
}
mtx_destroy(&gpu->data_shmem_mutex);
}
GGML_BACKEND_DL_IMPL(ggml_backend_virtgpu_reg)
+4 -1
View File
@@ -8,6 +8,9 @@
#include <memory>
#include <string>
#define GGML_VIRTGPU_NAME "ggml-virtgpu"
#define GGML_VIRTGPU "ggml-virtgpu: "
// USE_ALWAYS_TRUE_SUPPORTS_OP: 1 is fast, 0 avoid micro-benchmark crashes
#define USE_ALWAYS_TRUE_SUPPORTS_OP 1
@@ -62,7 +65,7 @@ static inline apir_buffer_type_host_handle_t ggml_buffer_type_to_apir_handle(ggm
static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {
if (!buffer->context) {
GGML_ABORT("%s: no context available :/", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: no context available :/", __func__);
}
return BUFFER_TO_HOST_HANDLE(buffer);
}
@@ -24,10 +24,10 @@ functions:
frontend_return: "int"
get_name:
frontend_return: "const char *"
frontend_return: "char *"
get_description:
frontend_return: "const char *"
frontend_return: "char *"
get_type:
frontend_return: "uint32_t"
@@ -64,35 +64,33 @@ functions:
group_description: "buffer-type"
functions:
get_name:
frontend_return: "const char *"
frontend_return: "char *"
frontend_extra_params:
- "ggml_backend_buffer_type_t buft"
- "apir_buffer_type_host_handle_t host_handle"
get_alignment:
frontend_return: "size_t"
frontend_extra_params:
- "ggml_backend_buffer_type_t buft"
- "apir_buffer_type_host_handle_t host_handle"
get_max_size:
frontend_return: "size_t"
frontend_extra_params:
- "ggml_backend_buffer_type_t buft"
- "apir_buffer_type_host_handle_t host_handle"
is_host:
frontend_return: "bool"
frontend_extra_params:
- "ggml_backend_buffer_type_t buft"
deprecated: true
alloc_buffer:
frontend_return: "apir_buffer_context_t"
frontend_extra_params:
- "ggml_backend_buffer_type_t buffer_buft"
- "apir_buffer_type_host_handle_t host_handle"
- "size_t size"
get_alloc_size:
frontend_return: "size_t"
frontend_extra_params:
- "ggml_backend_buffer_type_t buft"
- "apir_buffer_type_host_handle_t host_handle"
- "const ggml_tensor *op"
buffer:
+14 -4
View File
@@ -116,7 +116,7 @@ class RemotingCodebaseGenerator:
'frontend_return': func_metadata.get('frontend_return', 'void'),
'frontend_extra_params': func_metadata.get('frontend_extra_params', []),
'group_description': group_description,
'newly_added': func_metadata.get('newly_added', False)
'deprecated': func_metadata.get('deprecated', False),
})
enum_value += 1
@@ -165,6 +165,9 @@ class RemotingCodebaseGenerator:
signature = "uint32_t"
params = "apir_encoder *enc, apir_decoder *dec, virgl_apir_context *ctx"
if func['deprecated']:
decl_lines.append(f"/* {func['enum_name']} is deprecated. Keeping the handler for backward compatibility. */")
decl_lines.append(f"{signature} {func['backend_function']}({params});")
# Switch cases
@@ -176,7 +179,9 @@ class RemotingCodebaseGenerator:
switch_lines.append(f" /* {func['group_description']} */")
current_group = func['group_name']
switch_lines.append(f" case {func['enum_name']}: return \"{func['backend_function']}\";")
deprecated = " (DEPRECATED)" if func['deprecated'] else ""
switch_lines.append(f" case {func['enum_name']}: return \"{func['backend_function']}{deprecated}\";")
# Dispatch table
table_lines = []
@@ -188,7 +193,8 @@ class RemotingCodebaseGenerator:
table_lines.append("")
current_group = func['group_name']
table_lines.append(f" /* {func['enum_name']} = */ {func['backend_function']},")
deprecated = " /* DEPRECATED */" if func['deprecated'] else ""
table_lines.append(f" /* {func['enum_name']} = */ {func['backend_function']}{deprecated},")
header_content = f'''\
#pragma once
@@ -225,6 +231,10 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC
decl_lines.append(f"/* {func['group_description']} */")
current_group = func['group_name']
if func['deprecated']:
decl_lines.append(f"/* {func['frontend_function']} is deprecated. */")
continue
# Build parameter list
params = [self.naming_patterns['frontend_base_param']]
params.extend(func['frontend_extra_params'])
@@ -287,7 +297,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC
generated_files = [apir_backend_path, backend_dispatched_path, virtgpu_forward_path]
if not self.clang_format_available:
logging.warning("\n⚠️clang-format not found in PATH. Generated files will not be formatted."
logging.warning("\n⚠️clang-format not found in PATH. Generated files will not be formatted.\n"
" Install clang-format to enable automatic code formatting.")
else:
logging.info("\n🎨 Formatting files with clang-format...")
@@ -18,12 +18,17 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
virtgpu_shmem temp_shmem; // Local storage for large buffers
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
if (cgraph_size <= gpu->data_shmem.mmap_size) {
// prefer the init-time allocated page, if large enough
// Lock mutex before using shared data_shmem buffer
if (mtx_lock(&gpu->data_shmem_mutex) != thrd_success) {
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
}
using_shared_shmem = true;
shmem = &gpu->data_shmem;
} else if (virtgpu_shmem_create(gpu, cgraph_size, shmem)) {
GGML_ABORT("Couldn't allocate the guest-host shared buffer");
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);
}
apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
@@ -42,7 +47,10 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
remote_call_finish(gpu, encoder, decoder);
if (shmem != &gpu->data_shmem) {
// Unlock mutex before cleanup
if (using_shared_shmem) {
mtx_unlock(&gpu->data_shmem_mutex);
} else {
virtgpu_shmem_destroy(gpu, shmem);
}
@@ -1,20 +1,20 @@
#include "virtgpu-forward-impl.h"
const char * apir_buffer_type_get_name(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
char * apir_buffer_type_get_name(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME);
apir_encode_ggml_buffer_type(encoder, buft);
apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
REMOTE_CALL(gpu, encoder, decoder, ret);
const size_t string_size = apir_decode_array_size_unchecked(decoder);
char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
if (!string) {
GGML_LOG_ERROR("%s: Could not allocate the device name buffer\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device name buffer\n", __func__);
apir_decoder_set_fatal(decoder);
}
apir_decode_char_array(decoder, string, string_size);
@@ -24,14 +24,14 @@ const char * apir_buffer_type_get_name(virtgpu * gpu, ggml_backend_buffer_type_t
return string;
}
size_t apir_buffer_type_get_alignment(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
size_t apir_buffer_type_get_alignment(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT);
apir_encode_ggml_buffer_type(encoder, buft);
apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
REMOTE_CALL(gpu, encoder, decoder, ret);
@@ -43,14 +43,14 @@ size_t apir_buffer_type_get_alignment(virtgpu * gpu, ggml_backend_buffer_type_t
return alignment;
}
size_t apir_buffer_type_get_max_size(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
size_t apir_buffer_type_get_max_size(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE);
apir_encode_ggml_buffer_type(encoder, buft);
apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
REMOTE_CALL(gpu, encoder, decoder, ret);
@@ -62,26 +62,7 @@ size_t apir_buffer_type_get_max_size(virtgpu * gpu, ggml_backend_buffer_type_t b
return max_size;
}
bool apir_buffer_type_is_host(virtgpu * gpu, ggml_backend_buffer_type_t buft) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST);
apir_encode_ggml_buffer_type(encoder, buft);
REMOTE_CALL(gpu, encoder, decoder, ret);
bool is_host;
apir_decode_bool_t(decoder, &is_host);
remote_call_finish(gpu, encoder, decoder);
return is_host;
}
apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_buffer_type_t buft, size_t size) {
apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, size_t size) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
@@ -90,7 +71,7 @@ apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER);
apir_encode_ggml_buffer_type(encoder, buft);
apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
apir_encode_size_t(encoder, &size);
@@ -103,14 +84,14 @@ apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_
return buffer_context;
}
size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op) {
size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, const ggml_tensor * op) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE);
apir_encode_ggml_buffer_type(encoder, buft);
apir_encode_apir_buffer_type_host_handle(encoder, host_handle);
apir_encode_ggml_tensor_inline(encoder, op);
@@ -36,13 +36,18 @@ void apir_buffer_set_tensor(virtgpu * gpu,
virtgpu_shmem temp_shmem; // Local storage for large buffers
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
if (size <= gpu->data_shmem.mmap_size) {
// prefer the init-time allocated page, if large enough
// Lock mutex before using shared data_shmem buffer
if (mtx_lock(&gpu->data_shmem_mutex) != thrd_success) {
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
}
using_shared_shmem = true;
shmem = &gpu->data_shmem;
} else if (virtgpu_shmem_create(gpu, size, shmem)) {
GGML_ABORT("Couldn't allocate the guest-host shared buffer");
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);
}
memcpy(shmem->mmap_ptr, data, size);
@@ -55,7 +60,10 @@ void apir_buffer_set_tensor(virtgpu * gpu,
remote_call_finish(gpu, encoder, decoder);
if (shmem != &gpu->data_shmem) {
// Unlock mutex before cleanup
if (using_shared_shmem) {
mtx_unlock(&gpu->data_shmem_mutex);
} else {
virtgpu_shmem_destroy(gpu, shmem);
}
@@ -79,13 +87,18 @@ void apir_buffer_get_tensor(virtgpu * gpu,
virtgpu_shmem temp_shmem; // Local storage for large buffers
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
if (size <= gpu->data_shmem.mmap_size) {
// prefer the init-time allocated page, if large enough
// Lock mutex before using shared data_shmem buffer
if (mtx_lock(&gpu->data_shmem_mutex) != thrd_success) {
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
}
using_shared_shmem = true;
shmem = &gpu->data_shmem;
} else if (virtgpu_shmem_create(gpu, size, shmem)) {
GGML_ABORT("Couldn't allocate the guest-host shared buffer");
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);
}
apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
@@ -98,7 +111,10 @@ void apir_buffer_get_tensor(virtgpu * gpu,
remote_call_finish(gpu, encoder, decoder);
if (shmem != &gpu->data_shmem) {
// Unlock mutex before cleanup
if (using_shared_shmem) {
mtx_unlock(&gpu->data_shmem_mutex);
} else {
virtgpu_shmem_destroy(gpu, shmem);
}
}
@@ -2,11 +2,6 @@
#include "virtgpu-shm.h"
int apir_device_get_count(virtgpu * gpu) {
static int32_t dev_count = -1;
if (dev_count != -1) {
return dev_count;
}
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
@@ -14,6 +9,7 @@ int apir_device_get_count(virtgpu * gpu) {
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_COUNT);
REMOTE_CALL(gpu, encoder, decoder, ret);
int32_t dev_count = -1;
apir_decode_int32_t(decoder, &dev_count);
remote_call_finish(gpu, encoder, decoder);
@@ -21,11 +17,7 @@ int apir_device_get_count(virtgpu * gpu) {
return dev_count;
}
const char * apir_device_get_name(virtgpu * gpu) {
static char * string = nullptr;
if (string) {
return string;
}
char * apir_device_get_name(virtgpu * gpu) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
@@ -34,9 +26,9 @@ const char * apir_device_get_name(virtgpu * gpu) {
REMOTE_CALL(gpu, encoder, decoder, ret);
const size_t string_size = apir_decode_array_size_unchecked(decoder);
string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
if (!string) {
GGML_LOG_ERROR("%s: Could not allocate the device name buffer\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device name buffer\n", __func__);
return NULL;
}
apir_decode_char_array(decoder, string, string_size);
@@ -46,7 +38,7 @@ const char * apir_device_get_name(virtgpu * gpu) {
return string;
}
const char * apir_device_get_description(virtgpu * gpu) {
char * apir_device_get_description(virtgpu * gpu) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
@@ -58,7 +50,7 @@ const char * apir_device_get_description(virtgpu * gpu) {
const size_t string_size = apir_decode_array_size_unchecked(decoder);
char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
if (!string) {
GGML_LOG_ERROR("%s: Could not allocate the device description buffer\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device description buffer\n", __func__);
return NULL;
}
@@ -181,7 +173,7 @@ apir_buffer_context_t apir_device_buffer_from_ptr(virtgpu * gpu, size_t size, si
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR);
if (virtgpu_shmem_create(gpu, size, &buffer_context.shmem)) {
GGML_ABORT("Couldn't allocate the guest-host shared buffer");
GGML_ABORT(GGML_VIRTGPU "Couldn't allocate the guest-host shared buffer");
}
apir_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem.res_id);
+3 -3
View File
@@ -11,7 +11,7 @@
int32_t forward_flag = (int32_t) apir_command_type__; \
encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, forward_flag); \
if (!encoder_name) { \
GGML_ABORT("%s: failed to prepare the remote call encoder", __func__); \
GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); \
} \
} while (0)
@@ -19,10 +19,10 @@
do { \
ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \
if (!decoder_name) { \
GGML_ABORT("%s: failed to kick the remote call", __func__); \
GGML_ABORT(GGML_VIRTGPU "%s: failed to kick the remote call", __func__); \
} \
if (ret_name < APIR_FORWARD_BASE_INDEX) { \
GGML_ABORT("%s: failed to forward the API call: %s: code %d", __func__, \
GGML_ABORT(GGML_VIRTGPU "%s: failed to forward the API call: %s: code %d", __func__, \
apir_forward_error(ret_name), ret_name); \
} \
ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \
+11 -10
View File
@@ -3,8 +3,8 @@
/* device */
void apir_device_get_device_count(struct virtgpu * gpu);
int apir_device_get_count(struct virtgpu * gpu);
const char * apir_device_get_name(struct virtgpu * gpu);
const char * apir_device_get_description(struct virtgpu * gpu);
char * apir_device_get_name(struct virtgpu * gpu);
char * apir_device_get_description(struct virtgpu * gpu);
uint32_t apir_device_get_type(struct virtgpu * gpu);
void apir_device_get_memory(struct virtgpu * gpu, size_t * free, size_t * total);
bool apir_device_supports_op(struct virtgpu * gpu, const ggml_tensor * op);
@@ -17,14 +17,15 @@ void apir_device_get_props(struct virtgpu * gpu,
apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu * gpu, size_t size, size_t max_tensor_size);
/* buffer-type */
const char * apir_buffer_type_get_name(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
bool apir_buffer_type_is_host(struct virtgpu * gpu, ggml_backend_buffer_type_t buft);
apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu,
ggml_backend_buffer_type_t buffer_buft,
size_t size);
size_t apir_buffer_type_get_alloc_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op);
char * apir_buffer_type_get_name(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu,
apir_buffer_type_host_handle_t host_handle,
size_t size);
size_t apir_buffer_type_get_alloc_size(struct virtgpu * gpu,
apir_buffer_type_host_handle_t host_handle,
const ggml_tensor * op);
/* buffer */
void * apir_buffer_get_base(struct virtgpu * gpu, apir_buffer_context_t * buffer_context);
+1 -2
View File
@@ -85,8 +85,7 @@ int virtgpu_shmem_create(virtgpu * gpu, size_t size, virtgpu_shmem * shmem) {
void * ptr = virtgpu_ioctl_map(gpu, gem_handle, size);
if (!ptr) {
virtgpu_ioctl_gem_close(gpu, gem_handle);
GGML_LOG_ERROR("virtgpu_ioctl_map FAILED\n");
exit(1);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: virtgpu_ioctl_map failed\n", __func__);
return 1;
}
+105 -44
View File
@@ -33,7 +33,7 @@ static int virtgpu_handshake(virtgpu * gpu) {
encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_HANDSHAKE, 0);
if (!encoder) {
GGML_ABORT("%s: failed to prepare the remote call encoder", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__);
return 1;
}
@@ -52,7 +52,7 @@ static int virtgpu_handshake(virtgpu * gpu) {
log_call_duration(call_duration_ns, "API Remoting handshake");
if (!decoder) {
GGML_ABORT(
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initiate the communication with the virglrenderer library. "
"Most likely, the wrong virglrenderer library was loaded in the hypervisor.",
__func__);
@@ -65,7 +65,8 @@ static int virtgpu_handshake(virtgpu * gpu) {
uint32_t host_minor;
if (ret_magic != APIR_HANDSHAKE_MAGIC) {
GGML_ABORT("%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic,
GGML_ABORT(GGML_VIRTGPU
"%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic,
apir_backend_initialize_error(ret_magic));
} else {
apir_decode_uint32_t(decoder, &host_major);
@@ -78,13 +79,13 @@ static int virtgpu_handshake(virtgpu * gpu) {
return 1;
}
GGML_LOG_INFO("%s: Guest is running with %u.%u\n", __func__, guest_major, guest_minor);
GGML_LOG_INFO("%s: Host is running with %u.%u\n", __func__, host_major, host_minor);
GGML_LOG_INFO(GGML_VIRTGPU "%s: Guest is running with %u.%u\n", __func__, guest_major, guest_minor);
GGML_LOG_INFO(GGML_VIRTGPU "%s: Host is running with %u.%u\n", __func__, host_major, host_minor);
if (guest_major != host_major) {
GGML_LOG_ERROR("Host major (%d) and guest major (%d) version differ\n", host_major, guest_major);
GGML_LOG_ERROR(GGML_VIRTGPU "Host major (%d) and guest major (%d) version differ\n", host_major, guest_major);
} else if (guest_minor != host_minor) {
GGML_LOG_WARN("Host minor (%d) and guest minor (%d) version differ\n", host_minor, guest_minor);
GGML_LOG_WARN(GGML_VIRTGPU "Host minor (%d) and guest minor (%d) version differ\n", host_minor, guest_minor);
}
return 0;
@@ -97,7 +98,7 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_LOADLIBRARY, 0);
if (!encoder) {
GGML_ABORT("%s: hypercall error: failed to prepare the remote call encoder", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: hypercall error: failed to prepare the API Remoting command encoder", __func__);
return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR;
}
@@ -108,36 +109,67 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
log_call_duration(call_duration_ns, "API Remoting LoadLibrary");
if (!decoder) {
GGML_ABORT("%s: hypercall error: failed to kick the API remoting hypercall.\n", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: hypercall error: failed to trigger the API Remoting hypercall.\n", __func__);
return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR;
}
remote_call_finish(gpu, encoder, decoder);
if (ret == APIR_LOAD_LIBRARY_SUCCESS) {
GGML_LOG_INFO("%s: The API Remoting backend was successfully loaded and initialized\n", __func__);
GGML_LOG_INFO(GGML_VIRTGPU "The API Remoting backend was successfully loaded and initialized\n");
return ret;
}
// something wrong happened, find out what.
if (ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
GGML_ABORT("%s: virglrenderer could not load the API Remoting backend library: %s (code %d)", __func__,
apir_load_library_error(ret), ret);
if (ret == APIR_LOAD_LIBRARY_ENV_VAR_MISSING) {
GGML_ABORT(GGML_VIRTGPU
"%s: virglrenderer could not open the API Remoting backend library, "
"some environment variables are missing. "
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
__func__, apir_load_library_error(ret));
} else if (ret == APIR_LOAD_LIBRARY_CANNOT_OPEN) {
GGML_ABORT(GGML_VIRTGPU
"%s: virglrenderer could not open the API Remoting backend library. "
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
__func__, apir_load_library_error(ret));
} else if (ret == APIR_LOAD_LIBRARY_ENV_VAR_MISSING) {
GGML_ABORT(GGML_VIRTGPU
"%s: could not load the backend library, some symbols are missing. "
"Make sure virglrenderer is correctly configured by the hypervisor. (%s) ",
__func__, apir_load_library_error(ret));
} else {
GGML_ABORT(GGML_VIRTGPU
"%s: virglrenderer could not load the API Remoting backend library. (%s - code %d)", __func__,
apir_load_library_error(ret), ret);
}
return ret;
}
GGML_LOG_INFO("%s: virglrenderer successfully loaded the API Remoting backend library", __func__);
GGML_LOG_INFO(GGML_VIRTGPU
"%s: virglrenderer successfully loaded the API Remoting backend library.\n", __func__);
ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX);
if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
GGML_ABORT("%s: the API Remoting backend library couldn't load the backend library: apir code=%d | %s)",
if (apir_ret == APIR_LOAD_LIBRARY_CANNOT_OPEN) {
GGML_ABORT(GGML_VIRTGPU
"%s: the API Remoting backend library couldn't load the GGML backend library. "
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
__func__, apir_load_library_error(apir_ret));
} else if (apir_ret == APIR_LOAD_LIBRARY_SYMBOL_MISSING) {
GGML_ABORT(GGML_VIRTGPU
"%s: the API Remoting backend library couldn't load the GGML backend library, some symbols are missing. "
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
__func__, apir_load_library_error(apir_ret));
} else if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
GGML_ABORT(GGML_VIRTGPU
"%s: the API Remoting backend library couldn't load the GGML backend library: apir code=%d | %s)",
__func__, apir_ret, apir_load_library_error(apir_ret));
} else {
uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX;
GGML_ABORT("%s: the API Remoting backend library initialize its backend library: apir code=%d)", __func__,
GGML_ABORT(GGML_VIRTGPU
"%s: the API Remoting backend library initialize its backend library: apir code=%d)", __func__,
lib_ret);
}
return ret;
@@ -149,38 +181,58 @@ virtgpu * create_virtgpu() {
gpu->use_apir_capset = getenv("GGML_REMOTING_USE_APIR_CAPSET") != nullptr;
util_sparse_array_init(&gpu->shmem_array, sizeof(virtgpu_shmem), 1024);
// Initialize mutex to protect shared data_shmem buffer
if (mtx_init(&gpu->data_shmem_mutex, mtx_plain) != thrd_success) {
delete gpu;
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize data_shmem mutex", __func__);
return NULL;
}
if (virtgpu_open(gpu) != APIR_SUCCESS) {
GGML_ABORT("%s: failed to open the virtgpu device", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to open the virtgpu device\n", __func__);
return NULL;
}
if (virtgpu_init_capset(gpu) != APIR_SUCCESS) {
GGML_ABORT("%s: failed to initialize the GPU capset", __func__);
if (gpu->use_apir_capset) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize the virtgpu APIR capset. Make sure that the virglrenderer library supports it.", __func__);
} else {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize the virtgpu Venus capset", __func__);
}
return NULL;
}
if (virtgpu_init_context(gpu) != APIR_SUCCESS) {
GGML_ABORT("%s: failed to initialize the GPU context", __func__);
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize the GPU context", __func__);
return NULL;
}
if (virtgpu_shmem_create(gpu, SHMEM_REPLY_SIZE, &gpu->reply_shmem)) {
GGML_ABORT("%s: failed to create the shared reply memory pages", __func__);
GGML_ABORT(GGML_VIRTGPU
"%s: failed to create the shared reply memory pages", __func__);
return NULL;
}
if (virtgpu_shmem_create(gpu, SHMEM_DATA_SIZE, &gpu->data_shmem)) {
GGML_ABORT("%s: failed to create the shared data memory pages", __func__);
GGML_ABORT(GGML_VIRTGPU
"%s: failed to create the shared data memory pages", __func__);
return NULL;
}
if (virtgpu_handshake(gpu)) {
GGML_ABORT("%s: failed to handshake with the virglrenderer library", __func__);
GGML_ABORT(GGML_VIRTGPU
"%s: failed to handshake with the virglrenderer library", __func__);
return NULL;
}
if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) {
GGML_ABORT("%s: failed to load the backend library", __func__);
GGML_ABORT(GGML_VIRTGPU
"%s: failed to load the backend library", __func__);
return NULL;
}
@@ -191,7 +243,8 @@ static virt_gpu_result_t virtgpu_open(virtgpu * gpu) {
drmDevicePtr devs[8];
int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs));
if (count < 0) {
GGML_LOG_ERROR("%s: failed to enumerate DRM devices\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to enumerate DRM devices\n", __func__);
return APIR_ERROR_INITIALIZATION_FAILED;
}
@@ -213,16 +266,19 @@ static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr d
int fd = open(node_path, O_RDWR | O_CLOEXEC);
if (fd < 0) {
GGML_ABORT("failed to open %s", node_path);
GGML_ABORT(GGML_VIRTGPU
"%s: failed to open %s", __func__, node_path);
return APIR_ERROR_INITIALIZATION_FAILED;
}
drmVersionPtr version = drmGetVersion(fd);
if (!version || strcmp(version->name, "virtio_gpu") || version->version_major != 0) {
if (version) {
GGML_ABORT("unknown DRM driver %s version %d", version->name, version->version_major);
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: unknown DRM driver %s version %d\n", __func__, version->name, version->version_major);
} else {
GGML_ABORT("failed to get DRM driver version");
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to get DRM driver version\n", __func__);
}
if (version) {
@@ -236,7 +292,7 @@ static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr d
drmFreeVersion(version);
GGML_LOG_INFO("using DRM device %s\n", node_path);
GGML_LOG_INFO(GGML_VIRTGPU "using DRM device %s\n", node_path);
return APIR_SUCCESS;
}
@@ -245,7 +301,7 @@ static virt_gpu_result_t virtgpu_init_context(virtgpu * gpu) {
assert(!gpu->capset.version);
const int ret = virtgpu_ioctl_context_init(gpu, gpu->capset.id);
if (ret) {
GGML_LOG_INFO("failed to initialize context: %s\n", strerror(errno));
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to initialize context: %s\n", __func__, strerror(errno));
return APIR_ERROR_INITIALIZATION_FAILED;
}
@@ -254,10 +310,10 @@ static virt_gpu_result_t virtgpu_init_context(virtgpu * gpu) {
static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) {
if (gpu->use_apir_capset) {
GGML_LOG_INFO("Using the APIR capset\n");
GGML_LOG_INFO(GGML_VIRTGPU "Using the APIR capset\n");
gpu->capset.id = VIRTGPU_DRM_CAPSET_APIR;
} else {
GGML_LOG_INFO("Using the Venus capset\n");
GGML_LOG_INFO(GGML_VIRTGPU "Using the Venus capset\n");
gpu->capset.id = VIRTGPU_DRM_CAPSET_VENUS;
}
gpu->capset.version = 0;
@@ -266,7 +322,9 @@ static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) {
virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, &gpu->capset.data, sizeof(gpu->capset.data));
if (ret) {
GGML_LOG_INFO("failed to get APIR v%d capset: %s\n", gpu->capset.version, strerror(errno));
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to get APIR v%d capset: %s\n",
__func__, gpu->capset.version, strerror(errno));
return APIR_ERROR_INITIALIZATION_FAILED;
}
@@ -333,9 +391,9 @@ apir_encoder * remote_call_prepare(virtgpu * gpu, ApirCommandType apir_cmd_type,
* Prepare the command encoder and its buffer
*/
static char encoder_buffer[4096];
thread_local char encoder_buffer[4096];
static apir_encoder enc;
thread_local apir_encoder enc;
enc = {
.cur = encoder_buffer,
.start = encoder_buffer,
@@ -369,19 +427,19 @@ void remote_call_finish(virtgpu * gpu, apir_encoder * enc, apir_decoder * dec) {
UNUSED(gpu);
if (!enc) {
GGML_LOG_ERROR("Invalid (null) encoder\n");
GGML_ABORT(GGML_VIRTGPU "%s: Invalid (null) encoder", __func__);
}
if (!dec) {
GGML_LOG_ERROR("Invalid (null) decoder\n");
GGML_ABORT(GGML_VIRTGPU "%s: Invalid (null) decoder", __func__);
}
if (apir_encoder_get_fatal(enc)) {
GGML_LOG_ERROR("Failed to encode the output parameters.\n");
GGML_LOG_ERROR(GGML_VIRTGPU "%s: Failed to encode the output parameters.", __func__);
}
if (apir_decoder_get_fatal(dec)) {
GGML_LOG_ERROR("Failed to decode the input parameters.\n");
GGML_LOG_ERROR(GGML_VIRTGPU "%s: Failed to decode the input parameters.", __func__);
}
}
@@ -423,7 +481,7 @@ uint32_t remote_call(virtgpu * gpu,
int ret = drmIoctl(gpu->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &args);
if (ret != 0) {
GGML_ABORT("%s: the virtgpu EXECBUFFER ioctl failed (%d)", __func__, ret);
GGML_ABORT(GGML_VIRTGPU "%s: the virtgpu EXECBUFFER ioctl failed (%d)", __func__, ret);
}
/*
@@ -467,7 +525,7 @@ uint32_t remote_call(virtgpu * gpu,
}
if (max_wait_ms && timedout) {
GGML_LOG_ERROR("timed out waiting for the host answer...\n");
GGML_LOG_ERROR(GGML_VIRTGPU "%s: timed out waiting for the host answer...\n", __func__);
return APIR_FORWARD_TIMEOUT;
}
@@ -489,10 +547,13 @@ static void log_call_duration(long long call_duration_ns, const char * name) {
double call_duration_s = (double) call_duration_ns / 1e9; // 1 second = 1e9 nanoseconds
if (call_duration_s > 1) {
GGML_LOG_INFO("%s: waited %.2fs for the %s host reply...\n", __func__, call_duration_s, name);
GGML_LOG_INFO(GGML_VIRTGPU
"waited %.2fs for the %s host reply...\n", call_duration_s, name);
} else if (call_duration_ms > 1) {
GGML_LOG_INFO("%s: waited %.2fms for the %s host reply...\n", __func__, call_duration_ms, name);
GGML_LOG_INFO(GGML_VIRTGPU
"waited %.2fms for the %s host reply...\n", call_duration_ms, name);
} else {
GGML_LOG_INFO("%s: waited %lldns for the %s host reply...\n", __func__, call_duration_ns, name);
GGML_LOG_INFO(GGML_VIRTGPU
"waited %lldns for the %s host reply...\n", call_duration_ns, name);
}
}
+23
View File
@@ -17,6 +17,8 @@
#include <cstring>
#include "ggml-remoting.h"
#define VIRGL_RENDERER_UNSTABLE_APIS 1
#include "apir_hw.h"
#include <drm/virtgpu_drm.h>
@@ -73,6 +75,27 @@ struct virtgpu {
/* APIR communication pages */
virtgpu_shmem reply_shmem;
virtgpu_shmem data_shmem;
/* Mutex to protect shared data_shmem buffer from concurrent access */
mtx_t data_shmem_mutex;
/* Cached device information to prevent memory leaks and race conditions */
struct {
char * description;
char * name;
int32_t device_count;
uint32_t type;
size_t memory_free;
size_t memory_total;
} cached_device_info;
/* Cached buffer type information to prevent memory leaks and race conditions */
struct {
apir_buffer_type_host_handle_t host_handle;
char * name;
size_t alignment;
size_t max_size;
} cached_buffer_type;
};
static inline int virtgpu_ioctl(virtgpu * gpu, unsigned long request, void * args) {
+23 -1
View File
@@ -254,6 +254,7 @@ enum vk_device_architecture {
AMD_RDNA3,
INTEL_XE2,
NVIDIA_PRE_TURING,
NVIDIA_TURING,
};
static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
@@ -336,18 +337,34 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
bool cooperative_matrix = false;
bool sm_builtins = false;
// Detect "pre-turing" based on lack of coopmat support.
for (const auto& properties : ext_props) {
if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) {
cooperative_matrix = true;
break;
} else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) {
sm_builtins = true;
}
}
if (!cooperative_matrix) {
return vk_device_architecture::NVIDIA_PRE_TURING;
}
if (sm_builtins) {
vk::PhysicalDeviceProperties2 props2;
vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
props2.pNext = &sm_props;
device.getProperties2(&props2);
// Turing has 32, following architectures have 48
if (sm_props.shaderWarpsPerSM == 32) {
return vk_device_architecture::NVIDIA_TURING;
}
}
}
return vk_device_architecture::OTHER;
}
@@ -8460,6 +8477,11 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
FaCodePath path = ctx->device->coopmat2 ? FA_COOPMAT2 :
ctx->device->coopmat1_fa_support ? FA_COOPMAT1 : FA_SCALAR;
if (path == FA_COOPMAT1 && ctx->device->architecture == vk_device_architecture::NVIDIA_TURING) {
// Nvidia compiler bug, see https://github.com/ggml-org/llama.cpp/pull/19075#issuecomment-3820716090
path = FA_SCALAR;
}
if (path == FA_COOPMAT1) {
const bool coopmat_shape_supported = (dst->op_params[3] == GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f32acc) ||
(dst->op_params[3] != GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f16acc);
+1 -5
View File
@@ -1027,11 +1027,7 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
llama_sampler_chain_n(sampler) > 0;
if (sampler && can_offload) {
ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(model.dev_output());
auto * host_buft = ggml_backend_dev_host_buffer_type(model.dev_output());
if (host_buft) {
buft = host_buft;
}
auto * buft = ggml_backend_dev_buffer_type(model.dev_output());
sampler->iface->backend_init(sampler, buft);
+13 -6
View File
@@ -2419,6 +2419,9 @@ void llm_graph_context::build_sampling() const {
return;
}
std::array<ggml_tensor *, 2> outs;
outs[0] = res->t_logits;
auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
res->add_input(std::move(inp_sampling));
@@ -2439,14 +2442,14 @@ void llm_graph_context::build_sampling() const {
// add a dummy row of logits
// this trick makes the graph static, regardless of which samplers are activated
// this is important in order to minimize graph reallocations
// TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550)
ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
for (const auto & [seq_id, sampler] : samplers) {
const auto it = seq_to_logit_row.find(seq_id);
// inactive samplers always work on the first row
const auto row_idx = seq_to_logit_row.find(seq_id) != seq_to_logit_row.end() ? it->second : 0;
const auto row_idx = it != seq_to_logit_row.end() ? it->second : 0;
const int i_out = it != seq_to_logit_row.end() ? 1 : 0;
ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
@@ -2463,22 +2466,26 @@ void llm_graph_context::build_sampling() const {
if (data.sampled != nullptr) {
res->t_sampled[seq_id] = data.sampled;
ggml_build_forward_expand(gf, data.sampled);
outs[1] = data.sampled;
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
}
if (data.probs != nullptr) {
res->t_sampled_probs[seq_id] = data.probs;
ggml_build_forward_expand(gf, data.probs);
outs[1] = data.probs;
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
}
if (data.logits != nullptr) {
res->t_sampled_logits[seq_id] = data.logits;
ggml_build_forward_expand(gf, data.logits);
outs[1] = data.logits;
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
}
if (data.candidates != nullptr) {
res->t_candidates[seq_id] = data.candidates;
ggml_build_forward_expand(gf, data.candidates);
outs[1] = data.candidates;
ggml_build_forward_select(gf, outs.data(), outs.size(), i_out);
}
}
+19 -62
View File
@@ -1025,11 +1025,7 @@ struct llama_sampler_dist : public llama_sampler_backend {
std::mt19937 rng;
// backend input
struct ggml_tensor * inp_uniform;
ggml_context_ptr inp_ctx;
ggml_backend_buffer_ptr inp_buf;
ggml_tensor * inp_uniform;
};
static const char * llama_sampler_dist_name(const struct llama_sampler * smpl) {
@@ -1138,37 +1134,10 @@ static bool llama_sampler_dist_backend_init(
ggml_backend_buffer_type_t buft) {
auto * sctx = (llama_sampler_dist *) smpl->ctx;
// allocate inputs
{
ggml_init_params params = {
/*.mem_size =*/ ggml_tensor_overhead(),
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
sctx->inp_ctx.reset(ggml_init(params));
// Create the uniform random scalar input tensor. This will be set by
// llama_sampler_dist_backend_set_input after this graph is built.
sctx->inp_uniform = ggml_new_tensor_1d(sctx->inp_ctx.get(), GGML_TYPE_F32, 1);
ggml_set_name (sctx->inp_uniform, "uniform");
ggml_set_input(sctx->inp_uniform);
// Allocate all tensors from our context to the backend
sctx->inp_buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(sctx->inp_ctx.get(), buft));
ggml_backend_buffer_clear(sctx->inp_buf.get(), 0);
}
const bool res = llama_sampler_backend_support(smpl, buft);
sctx->init(res);
if (!res) {
sctx->inp_ctx.reset(nullptr);
sctx->inp_buf.reset(nullptr);
}
return res;
}
@@ -1178,8 +1147,13 @@ static void llama_sampler_dist_backend_apply(
struct ggml_cgraph * gf,
struct llama_sampler_data * data) {
GGML_UNUSED(gf);
auto * sctx = (llama_sampler_dist *) smpl->ctx;
sctx->inp_uniform = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
ggml_set_name (sctx->inp_uniform, "uniform");
ggml_set_input(sctx->inp_uniform);
struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits);
ggml_set_name(probs, "dist_probs");
@@ -1226,6 +1200,7 @@ static void llama_sampler_dist_backend_apply(
static void llama_sampler_dist_backend_set_input(struct llama_sampler * smpl) {
auto * sctx = (llama_sampler_dist *) smpl->ctx;
GGML_ASSERT(sctx->inp_uniform != nullptr);
// We sample in double precision and cast to float to match rnd numbers of
@@ -1262,8 +1237,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
/* .seed_cur = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),
/* .inp_uniform = */ nullptr,
/* .inp_ctx = */ nullptr,
/* .inp_buf = */ nullptr,
}
);
}
@@ -3461,9 +3434,6 @@ struct llama_sampler_logit_bias : public llama_sampler_backend {
struct ggml_tensor * inp_logit_bias;
struct ggml_tensor * inp_logit_idxs;
ggml_context_ptr inp_ctx;
ggml_backend_buffer_ptr inp_buf;
};
static const char * llama_sampler_logit_bias_name(const struct llama_sampler * smpl) {
@@ -3526,6 +3496,16 @@ static void llama_sampler_logit_bias_backend_apply(
return;
}
const size_t n = sctx->logit_bias.size();
sctx->inp_logit_bias = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n);
ggml_set_name(sctx->inp_logit_bias, "logit_bias");
ggml_set_input(sctx->inp_logit_bias);
sctx->inp_logit_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n);
ggml_set_name(sctx->inp_logit_idxs, "logit_idxs");
ggml_set_input(sctx->inp_logit_idxs);
ggml_tensor * cur = ggml_fill(ctx, data->logits, 0.0f);
cur = ggml_reshape_2d(ctx, cur, 1, ggml_nelements(cur));
@@ -3562,6 +3542,8 @@ static void llama_sampler_logit_bias_backend_set_input(struct llama_sampler * sm
static bool llama_sampler_logit_bias_backend_init(
struct llama_sampler * smpl,
ggml_backend_buffer_type_t buft) {
GGML_UNUSED(buft);
auto * sctx = (llama_sampler_logit_bias *) smpl->ctx;
sctx->init(true);
@@ -3570,29 +3552,6 @@ static bool llama_sampler_logit_bias_backend_init(
return true;
}
ggml_init_params params = {
/*.mem_size =*/ 2*ggml_tensor_overhead(),
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
sctx->inp_ctx.reset(ggml_init(params));
const size_t n = sctx->logit_bias.size();
sctx->inp_logit_bias = ggml_new_tensor_2d(sctx->inp_ctx.get(), GGML_TYPE_F32, 1, n);
ggml_set_name(sctx->inp_logit_bias, "logit_bias");
ggml_set_input(sctx->inp_logit_bias);
sctx->inp_logit_idxs = ggml_new_tensor_1d(sctx->inp_ctx.get(), GGML_TYPE_I32, n);
ggml_set_name(sctx->inp_logit_idxs, "logit_idxs");
ggml_set_input(sctx->inp_logit_idxs);
// Allocate all tensors from our context to the backend
sctx->inp_buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(sctx->inp_ctx.get(), buft));
ggml_backend_buffer_clear(sctx->inp_buf.get(), 0);
return true;
}
@@ -3628,8 +3587,6 @@ struct llama_sampler * llama_sampler_init_logit_bias(
/* .to_search = */ {},
/* .inp_logit_bias = */ nullptr,
/* .inp_logit_idxs = */ nullptr,
/* .inp_ctx = */ nullptr,
/* .inp_buf = */ nullptr,
}
);
}
+5 -8
View File
@@ -674,15 +674,12 @@ int main(int argc, char ** argv) {
}
}
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
if (!embd.empty()) {
int n_eval = (int) embd.size();
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
GGML_ASSERT(n_eval <= params.n_batch);
if (llama_decode(ctx, llama_batch_get_one(embd.data(), n_eval))) {
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
@@ -743,7 +740,7 @@ int main(int argc, char ** argv) {
common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
if ((int) embd.size() == params.n_batch) {
break;
}
}