mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-09 07:16:44 +02:00
ggml-cpu: Add IME2 Instruction Support for the SpacemiT Backend (#22863)
This commit is contained in:
@@ -301,16 +301,17 @@ jobs:
|
||||
export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
|
||||
cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DGGML_CPU_REPACK=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DGGML_CPU_RISCV64_SPACEMIT=ON \
|
||||
-DGGML_RVV=ON \
|
||||
-DGGML_RV_ZVFH=ON \
|
||||
-DGGML_RV_ZFH=ON \
|
||||
-DGGML_RV_ZICBOP=ON \
|
||||
-DGGML_RV_ZIHINTPAUSE=ON \
|
||||
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
|
||||
-DGGML_RV_ZBA=ON \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
@@ -24,6 +24,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
|
||||
set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
|
||||
set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_C_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_CXX_FLAGS}")
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
|
||||
|
||||
@@ -9,18 +9,20 @@ wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_6
|
||||
~~~
|
||||
|
||||
2. Build
|
||||
Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
|
||||
Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1` and `RISCV64_SPACEMIT_IME2`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
|
||||
```bash
|
||||
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_CPU_RISCV64_SPACEMIT=ON \
|
||||
-DGGML_CPU_REPACK=OFF \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_RVV=ON \
|
||||
-DGGML_RV_ZVFH=ON \
|
||||
-DGGML_RV_ZFH=ON \
|
||||
-DGGML_RV_ZICBOP=ON \
|
||||
-DGGML_RV_ZIHINTPAUSE=ON \
|
||||
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
|
||||
-DGGML_RV_ZBA=ON \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
|
||||
-DCMAKE_INSTALL_PREFIX=build/installed
|
||||
|
||||
@@ -47,8 +49,25 @@ export RISCV_ROOT_PATH_IME1={your RISC-V compiler path}
|
||||
|
||||
${QEMU_ROOT_PATH}/bin/qemu-riscv64 -L ${RISCV_ROOT_PATH_IME1}/sysroot -cpu max,vlen=256,elen=64,vext_spec=v1.0 ${PWD}/build/bin/llama-cli -m ${PWD}/models/Qwen2.5-0.5B-Instruct-Q4_0.gguf -t 1
|
||||
~~~
|
||||
|
||||
## Quantization Support For Matrix
|
||||
|
||||
| Quantization Type | X60 | A100 |
|
||||
| ---: | ---: | ---: |
|
||||
| Q2_K | | :heavy_check_mark: |
|
||||
| Q3_K | | :heavy_check_mark: |
|
||||
| Q4_0 | :heavy_check_mark: | :heavy_check_mark: |
|
||||
| Q4_1 | :heavy_check_mark: | :heavy_check_mark: |
|
||||
| Q4_K | :heavy_check_mark: | :heavy_check_mark: |
|
||||
| Q5_0 | | :heavy_check_mark: |
|
||||
| Q5_1 | | :heavy_check_mark: |
|
||||
| Q5_K | | :heavy_check_mark: |
|
||||
| Q6_K | | :heavy_check_mark: |
|
||||
| Q8_0 | | :heavy_check_mark: |
|
||||
|
||||
|
||||
## Performance
|
||||
#### Quantization Support For Matrix
|
||||
* Spacemit(R) X60
|
||||
~~~
|
||||
model name : Spacemit(R) X60
|
||||
isa : rv64imafdcv_zicbom_zicboz_zicntr_zicond_zicsr_zifencei_zihintpause_zihpm_zfh_zfhmin_zca_zcd_zba_zbb_zbc_zbs_zkt_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkt_sscofpmf_sstc_svinval_svnapot_svpbmt
|
||||
@@ -58,33 +77,34 @@ mvendorid : 0x710
|
||||
marchid : 0x8000000058000001
|
||||
~~~
|
||||
|
||||
Q4_0
|
||||
| Model | Size | Params | backend | threads | test | t/s |
|
||||
| -----------| -------- | ------ | ------- | ------- | ---- |------|
|
||||
Qwen2.5 0.5B |403.20 MiB|630.17 M| cpu | 4 | pp512|64.12 ± 0.26|
|
||||
Qwen2.5 0.5B |403.20 MiB|630.17 M| cpu | 4 | tg128|10.03 ± 0.01|
|
||||
Qwen2.5 1.5B |1011.16 MiB| 1.78 B | cpu | 4 | pp512|24.16 ± 0.02|
|
||||
Qwen2.5 1.5B |1011.16 MiB| 1.78 B | cpu | 4 | tg128|3.83 ± 0.06|
|
||||
Qwen2.5 3B | 1.86 GiB | 3.40 B | cpu | 4 | pp512|12.08 ± 0.02|
|
||||
Qwen2.5 3B | 1.86 GiB | 3.40 B | cpu | 4 | tg128|2.23 ± 0.02|
|
||||
|
||||
Q4_1
|
||||
| Model | Size | Params | backend | threads | test | t/s |
|
||||
| -----------| -------- | ------ | ------- | ------- | ---- |------|
|
||||
Qwen2.5 0.5B |351.50 MiB|494.03 M| cpu | 4 | pp512|62.07 ± 0.12|
|
||||
Qwen2.5 0.5B |351.50 MiB|494.03 M| cpu | 4 | tg128|9.91 ± 0.01|
|
||||
Qwen2.5 1.5B |964.06 MiB| 1.54 B | cpu | 4 | pp512|22.95 ± 0.25|
|
||||
Qwen2.5 1.5B |964.06 MiB| 1.54 B | cpu | 4 | tg128|4.01 ± 0.15|
|
||||
Qwen2.5 3B | 1.85 GiB | 3.09 B | cpu | 4 | pp512|11.55 ± 0.16|
|
||||
Qwen2.5 3B | 1.85 GiB | 3.09 B | cpu | 4 | tg128|2.25 ± 0.04|
|
||||
| model | size | params | backend | threads | n_ubatch | fa | mmap | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | ---: | --------------: | -------------------: |
|
||||
| qwen35 2B Q4_1 | 1.19 GiB | 1.88 B | CPU | 4 | 128 | 1 | 0 | pp128 | 10.32 ± 0.02 |
|
||||
| qwen35 2B Q4_1 | 1.19 GiB | 1.88 B | CPU | 4 | 128 | 1 | 0 | tg128 | 3.07 ± 0.01 |
|
||||
| qwen3 0.6B Q4_0 | 358.78 MiB | 596.05 M | CPU | 4 | 128 | 1 | 0 | pp128 | 49.15 ± 0.25 |
|
||||
| qwen3 0.6B Q4_0 | 358.78 MiB | 596.05 M | CPU | 4 | 128 | 1 | 0 | tg128 | 11.73 ± 0.02 |
|
||||
|
||||
|
||||
Q4_K
|
||||
| Model | Size | Params | backend | threads | test | t/s |
|
||||
| -----------| -------- | ------ | ------- | ------- | ---- |------|
|
||||
Qwen2.5 0.5B |462.96 MiB|630.17 M| cpu | 4 | pp512|9.29 ± 0.05|
|
||||
Qwen2.5 0.5B |462.96 MiB|630.17 M| cpu | 4 | tg128|5.67 ± 0.04|
|
||||
Qwen2.5 1.5B | 1.04 GiB | 1.78 B | cpu | 4 | pp512|10.38 ± 0.10|
|
||||
Qwen2.5 1.5B | 1.04 GiB | 1.78 B | cpu | 4 | tg128|3.17 ± 0.08|
|
||||
Qwen2.5 3B | 1.95 GiB | 3.40 B | cpu | 4 | pp512|4.23 ± 0.04|
|
||||
Qwen2.5 3B | 1.95 GiB | 3.40 B | cpu | 4 | tg128|1.73 ± 0.00|
|
||||
* Spacemit(R) A100
|
||||
~~~
|
||||
model name : Spacemit(R) A100
|
||||
isa : rv64imafdcvh_zicbom_zicbop_zicboz_zicntr_zicond_zicsr_zifencei_zihintntl_zihintpause_zihpm_zimop_zaamo_zalrsc_zawrs_zfa_zfh_zfhmin_zca_zcb_zcd_zcmop_zba_zbb_zbc_zbs_zkt_zvbb_zvbc_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvkt_smaia_smstateen_ssaia_sscofpmf_sstc_svinval_svnapot_svpbmt_sdtrig
|
||||
mmu : sv39
|
||||
mvendorid : 0x710
|
||||
marchid : 0x8000000041000002
|
||||
mimpid : 0x10000000d5686200
|
||||
hart isa : rv64imafdcv_zicbom_zicbop_zicboz_zicntr_zicond_zicsr_zifencei_zihintntl_zihintpause_zihpm_zimop_zaamo_zalrsc_zawrs_zfa_zfh_zfhmin_zca_zcb_zcd_zcmop_zba_zbb_zbc_zbs_zkt_zvbb_zvbc_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvkt_smaia_smstateen_ssaia_sscofpmf_sstc_svinval_svnapot_svpbmt_sdtrig
|
||||
~~~
|
||||
|
||||
| model | size | params | backend | threads | n_ubatch | fa | mmap | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | ---: | --------------: | -------------------: |
|
||||
| qwen3 0.6B Q4_0 | 358.78 MiB | 596.05 M | CPU | 8 | 128 | 1 | 0 | pp128 | 565.83 ± 0.31 |
|
||||
| qwen3 0.6B Q4_0 | 358.78 MiB | 596.05 M | CPU | 8 | 128 | 1 | 0 | tg128 | 55.77 ± 0.02 |
|
||||
| qwen3 4B Q4_0 | 2.21 GiB | 4.02 B | CPU | 8 | 128 | 1 | 0 | pp128 | 79.74 ± 0.04 |
|
||||
| qwen3 4B Q4_0 | 2.21 GiB | 4.02 B | CPU | 8 | 128 | 1 | 0 | tg128 | 11.29 ± 0.00 |
|
||||
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CPU | 8 | 128 | 1 | 0 | pp128 | 57.88 ± 0.31 |
|
||||
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CPU | 8 | 128 | 1 | 0 | tg128 | 12.79 ± 0.00 |
|
||||
| qwen35 2B Q4_1 | 1.19 GiB | 1.88 B | CPU | 8 | 128 | 1 | 0 | pp128 | 115.23 ± 0.04 |
|
||||
| qwen35 2B Q4_1 | 1.19 GiB | 1.88 B | CPU | 8 | 128 | 1 | 0 | tg128 | 16.49 ± 0.01 |
|
||||
| gemma4 E4B Q4_K - Medium | 4.76 GiB | 7.52 B | CPU | 8 | 128 | 1 | 0 | pp128 | 21.13 ± 0.01 |
|
||||
| gemma4 E4B Q4_K - Medium | 4.76 GiB | 7.52 B | CPU | 8 | 128 | 1 | 0 | tg128 | 5.66 ± 0.00 |
|
||||
|
||||
@@ -450,12 +450,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
ggml-cpu/arch/riscv/repack.cpp
|
||||
)
|
||||
if (GGML_CPU_RISCV64_SPACEMIT)
|
||||
include(ggml-cpu/cmake/FindSMTIME.cmake)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
|
||||
list(APPEND GGML_CPU_SOURCES
|
||||
ggml-cpu/spacemit/ime.cpp
|
||||
ggml-cpu/spacemit/ime.h
|
||||
ggml-cpu/spacemit/spine_mem_pool.cpp
|
||||
ggml-cpu/spacemit/spine_mem_pool.h
|
||||
ggml-cpu/spacemit/repack.cpp
|
||||
ggml-cpu/spacemit/repack.h
|
||||
ggml-cpu/spacemit/ime_env.cpp
|
||||
ggml-cpu/spacemit/ime_env.h
|
||||
ggml-cpu/spacemit/ime1_kernels.cpp
|
||||
ggml-cpu/spacemit/ime2_kernels.cpp
|
||||
ggml-cpu/spacemit/ime_kernels.h
|
||||
ggml-cpu/spacemit/rvv_kernels.cpp
|
||||
ggml-cpu/spacemit/rvv_kernels.h
|
||||
)
|
||||
endif()
|
||||
if(NOT GGML_CPU_ALL_VARIANTS)
|
||||
@@ -485,6 +495,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (GGML_RV_ZIHINTPAUSE)
|
||||
string(APPEND MARCH_STR "_zihintpause")
|
||||
endif()
|
||||
if (GGML_RV_ZBA)
|
||||
string(APPEND MARCH_STR "_zba")
|
||||
endif()
|
||||
if (GGML_CPU_RISCV64_SPACEMIT)
|
||||
# `xsmtvdotii' is only required for GCC >= 15.
|
||||
if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
include(CheckCSourceRuns)
|
||||
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)" AND GGML_CPU_RISCV64_SPACEMIT)
|
||||
set(SMT_MARCH_STR "-march=rv64gcv_zfh_zvfh_zba_zicbop")
|
||||
if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
|
||||
CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 15)
|
||||
string(APPEND SMT_MARCH_STR "_xsmtvdotii")
|
||||
endif()
|
||||
set(CMAKE_REQUIRED_FLAGS "${SMT_MARCH_STR}")
|
||||
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_IME1)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1, i4\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S4)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1, i8\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S8)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vfwmadot v2, v0, v1, fp16\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFWMADOT_FP16)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vmadot.hp v2, v0, v1, v0, 0, i4\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFMADOT_S4)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vmadot.hp v2, v0, v1, v0, 0, i8\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFMADOT_S8)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vmadot1 v2, v0, v1\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOTN)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vpack.vv v2, v0, v1, 2\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VPACK)
|
||||
check_c_source_compiles("int main() {__asm__ volatile(\"vnspack.vv v2, v0, v1, 2\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VNPACK)
|
||||
unset(CMAKE_REQUIRED_FLAGS)
|
||||
|
||||
list(APPEND RISCV64_SPACEMIT_IME_SPEC "")
|
||||
if (SPACEMIT_RISCV_COMPILER_SUPPORT_IME1)
|
||||
set(RISCV64_SPACEMIT_IME_SPEC "RISCV64_SPACEMIT_IME1")
|
||||
endif()
|
||||
|
||||
if (SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S4 AND SPACEMIT_RISCV_COMPILER_SUPPORT_VPACK AND SPACEMIT_RISCV_COMPILER_SUPPORT_VNPACK)
|
||||
list(APPEND RISCV64_SPACEMIT_IME_SPEC "RISCV64_SPACEMIT_IME2")
|
||||
endif()
|
||||
|
||||
message("RISCV64_SPACEMIT_IME_SPEC: ${RISCV64_SPACEMIT_IME_SPEC}")
|
||||
endif()
|
||||
@@ -50,6 +50,10 @@
|
||||
#include "llamafile/sgemm.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
|
||||
# include "spacemit/ime.h"
|
||||
#endif
|
||||
|
||||
// Note: once we move threading into a separate C++ file
|
||||
// will use std::hardware_destructive_interference_size instead of hardcoding it here
|
||||
// and we'll use C++ attribute syntax.
|
||||
@@ -3011,7 +3015,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
const struct ggml_cgraph * cgraph = tp->cgraph;
|
||||
const struct ggml_cplan * cplan = tp->cplan;
|
||||
|
||||
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
|
||||
ggml_backend_cpu_riscv64_spacemit_set_numa_thread_affinity(state->ith);
|
||||
#else
|
||||
set_numa_thread_affinity(state->ith);
|
||||
#endif
|
||||
|
||||
struct ggml_compute_params params = {
|
||||
/*.ith =*/ state->ith,
|
||||
@@ -3068,6 +3076,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
|
||||
ggml_barrier(state->threadpool);
|
||||
|
||||
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
|
||||
ggml_backend_cpu_riscv64_spacemit_clear_numa_thread_affinity_threaded(state->ith);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
+1438
-723
File diff suppressed because it is too large
Load Diff
@@ -8,6 +8,14 @@ extern "C" {
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_riscv64_spacemit_buffer_type(void);
|
||||
|
||||
void ggml_backend_cpu_riscv64_spacemit_set_numa_thread_affinity(int thread_n);
|
||||
|
||||
void ggml_backend_cpu_riscv64_spacemit_clear_numa_thread_affinity_threaded(int thread_n);
|
||||
|
||||
void * ggml_backend_cpu_riscv64_spacemit_alloc_shared(size_t size, size_t alignment);
|
||||
|
||||
void ggml_backend_cpu_riscv64_spacemit_free_shared(void * ptr);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,320 @@
|
||||
#include "ime_env.h"
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "spine_mem_pool.h"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cctype>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace ggml::cpu::riscv64_spacemit {
|
||||
bool spine_core_info::get_spine_core_info(std::vector<spine_core_info> & result) {
|
||||
static std::unordered_map<uint64_t, spine_core_arch_id> spine_march_mapping_ = {
|
||||
{0x8000000058000001, spine_core_arch_id::core_arch_x60 },
|
||||
{ 0x8000000041000001, spine_core_arch_id::core_arch_a60 },
|
||||
{ 0x8000000058000002, spine_core_arch_id::core_arch_x100},
|
||||
{ 0x8000000041000002, spine_core_arch_id::core_arch_a100},
|
||||
};
|
||||
|
||||
result.clear();
|
||||
std::ifstream file("/proc/cpuinfo");
|
||||
std::string line;
|
||||
|
||||
std::vector<std::array<uint64_t, 2>> cpu_info_list;
|
||||
|
||||
uint64_t current_processor = spine_invalid_core_id;
|
||||
uint64_t current_marchid = 0;
|
||||
bool has_processor = false;
|
||||
bool has_marchid = false;
|
||||
|
||||
if (!file.is_open()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
while (std::getline(file, line)) {
|
||||
if (line.substr(0, 9) == "processor") {
|
||||
if (has_processor && has_marchid) {
|
||||
cpu_info_list.push_back({ current_processor, current_marchid });
|
||||
}
|
||||
|
||||
size_t colon_pos = line.find(':');
|
||||
if (colon_pos != std::string::npos) {
|
||||
current_processor = std::stoi(line.substr(colon_pos + 1));
|
||||
has_processor = true;
|
||||
}
|
||||
|
||||
has_marchid = false;
|
||||
} else if (line.substr(0, 7) == "marchid") {
|
||||
size_t colon_pos = line.find(':');
|
||||
if (colon_pos != std::string::npos) {
|
||||
std::string marchid_str = line.substr(colon_pos + 1);
|
||||
marchid_str.erase(std::remove_if(marchid_str.begin(), marchid_str.end(), isspace), marchid_str.end());
|
||||
current_marchid = std::stoull(marchid_str, nullptr, 16);
|
||||
has_marchid = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (has_processor && has_marchid) {
|
||||
cpu_info_list.push_back({ current_processor, current_marchid });
|
||||
}
|
||||
|
||||
if (has_processor && has_marchid) {
|
||||
for (auto & cpu_info : cpu_info_list) {
|
||||
if (cpu_info[0] != spine_invalid_core_id &&
|
||||
spine_march_mapping_.find(cpu_info[1]) != spine_march_mapping_.end()) {
|
||||
auto core_info = spine_core_info();
|
||||
core_info.core_id = cpu_info[0];
|
||||
core_info.arch_id = spine_core_arch_id(spine_march_mapping_[cpu_info[1]]);
|
||||
|
||||
result.push_back(core_info);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return has_processor && has_marchid;
|
||||
}
|
||||
|
||||
namespace {
|
||||
uint16_t hex_string_to_u16(const std::string & hex_str) {
|
||||
try {
|
||||
size_t pos = 0;
|
||||
if (hex_str.substr(0, 2) == "0x" || hex_str.substr(0, 2) == "0X") {
|
||||
pos = 2;
|
||||
}
|
||||
unsigned long result = std::stoul(hex_str.substr(pos), nullptr, 16);
|
||||
if (result > std::numeric_limits<uint16_t>::max()) {
|
||||
throw std::out_of_range("Converted value is out of range for uint16_t");
|
||||
}
|
||||
return static_cast<uint16_t>(result);
|
||||
} catch (const std::invalid_argument & e) {
|
||||
throw std::invalid_argument("Invalid hexadecimal string");
|
||||
} catch (const std::out_of_range & e) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
const char * spine_mem_pool_backend_to_string(spine_mem_pool_backend backend) {
|
||||
switch (backend) {
|
||||
case spine_mem_pool_backend::none:
|
||||
return "NONE";
|
||||
case spine_mem_pool_backend::posix_memalign:
|
||||
return "POSIX";
|
||||
case spine_mem_pool_backend::transparent_hugepage:
|
||||
return "HPAGE";
|
||||
case spine_mem_pool_backend::hugetlb_1g:
|
||||
return "HPAGE1GB";
|
||||
}
|
||||
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
spine_mem_pool_backend parse_mem_backend(const char * mem_backend_str) {
|
||||
if (mem_backend_str == nullptr || mem_backend_str[0] == '\0') {
|
||||
return spine_mem_pool_backend::transparent_hugepage;
|
||||
}
|
||||
|
||||
std::string value(mem_backend_str);
|
||||
std::transform(value.begin(), value.end(), value.begin(),
|
||||
[](unsigned char ch) { return static_cast<char>(std::tolower(ch)); });
|
||||
|
||||
if (value == "none") {
|
||||
return spine_mem_pool_backend::none;
|
||||
}
|
||||
|
||||
if (value == "posix") {
|
||||
return spine_mem_pool_backend::posix_memalign;
|
||||
}
|
||||
|
||||
if (value == "hpage") {
|
||||
return spine_mem_pool_backend::transparent_hugepage;
|
||||
}
|
||||
|
||||
if (value == "hpage1gb") {
|
||||
return spine_mem_pool_backend::hugetlb_1g;
|
||||
}
|
||||
|
||||
throw std::runtime_error("invalid SPACEMIT_MEM_BACKEND: " + value + ", expected NONE, POSIX, HPAGE or HPAGE1GB");
|
||||
}
|
||||
} // namespace
|
||||
|
||||
spine_env_info::spine_env_info() {
|
||||
num_cores = static_cast<int>(std::thread::hardware_concurrency());
|
||||
spine_core_info::get_spine_core_info(core_info_list);
|
||||
|
||||
// special for x60 K1
|
||||
if (core_info_list.size() == 8 && core_info_list[0].arch_id == spine_core_arch_id::core_arch_x60) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
core_info_list[i].arch_id = spine_core_arch_id::core_arch_a60;
|
||||
}
|
||||
}
|
||||
|
||||
// special for qemu
|
||||
if (core_info_list.size() == 0) {
|
||||
char * spine_core_arch_str = getenv("SPACEMIT_CORE_ARCH");
|
||||
if (spine_core_arch_str != nullptr) {
|
||||
auto arch_id = hex_string_to_u16(spine_core_arch_str);
|
||||
for (int i = 0; i < num_cores; i++) {
|
||||
auto core_info = spine_core_info();
|
||||
core_info.core_id = i;
|
||||
core_info.arch_id = spine_core_arch_id{ arch_id };
|
||||
core_info_list.push_back(core_info);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (core_info_list.size() == 0) {
|
||||
throw std::runtime_error(
|
||||
"Failed to get SPACEMIT_CORE_ARCH from environment or failed to parse it from /proc/cpuinfo");
|
||||
}
|
||||
|
||||
char * spine_perfer_core_arch_str = getenv("SPACEMIT_PERFER_CORE_ARCH");
|
||||
if (spine_perfer_core_arch_str != nullptr && spine_perfer_core_arch_str != "") {
|
||||
perfer_core_arch_id = spine_core_arch_id{ hex_string_to_u16(spine_perfer_core_arch_str) };
|
||||
}
|
||||
|
||||
char * spine_perfer_core_id_str = getenv("SPACEMIT_PERFER_CORE_ID");
|
||||
std::vector<int> perfer_core_id_vec;
|
||||
if (spine_perfer_core_id_str != nullptr && spine_perfer_core_id_str != "") {
|
||||
std::string perfer_core_id_str(spine_perfer_core_id_str);
|
||||
size_t start = 0;
|
||||
size_t end = 0;
|
||||
while ((end = perfer_core_id_str.find(',', start)) != std::string::npos) {
|
||||
std::string core_id_substr = perfer_core_id_str.substr(start, end - start);
|
||||
perfer_core_id_vec.push_back(std::stoi(core_id_substr));
|
||||
start = end + 1;
|
||||
}
|
||||
std::string core_id_substr = perfer_core_id_str.substr(start);
|
||||
perfer_core_id_vec.push_back(std::stoi(core_id_substr));
|
||||
}
|
||||
|
||||
perfer_core_ids.reserve(num_cores);
|
||||
if (perfer_core_arch_id == spine_core_arch_id::core_arch_none) {
|
||||
for (auto & core_info : core_info_list) {
|
||||
auto core_arch_id = core_info.arch_id;
|
||||
auto core_arch_head = (uint16_t) (core_arch_id) >> 12;
|
||||
if (core_arch_head == 0xA) {
|
||||
num_perfer_cores++;
|
||||
perfer_core_arch_id = core_arch_id;
|
||||
cpu_mask |= (1ULL << core_info.core_id);
|
||||
perfer_core_ids.push_back(core_info.core_id);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (auto & core_info : core_info_list) {
|
||||
auto core_arch_id = core_info.arch_id;
|
||||
if (core_arch_id == perfer_core_arch_id) {
|
||||
num_perfer_cores++;
|
||||
cpu_mask |= (1ULL << core_info.core_id);
|
||||
|
||||
auto core_arch_head = (uint16_t) (core_arch_id) >> 12;
|
||||
if (core_arch_head == 0xA) {
|
||||
perfer_core_ids.push_back(core_info.core_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (num_perfer_cores == 0) {
|
||||
GGML_ABORT("can not find core with arch id %x for SPACEMIT_PERFER_CORE_ARCH in core info list\n",
|
||||
(uint16_t) perfer_core_arch_id);
|
||||
}
|
||||
}
|
||||
|
||||
if (perfer_core_id_vec.size() > 0) {
|
||||
perfer_core_ids.clear();
|
||||
cpu_mask = 0;
|
||||
num_perfer_cores = 0;
|
||||
for (int core_id : perfer_core_id_vec) {
|
||||
if (core_id < 0 || core_id >= num_cores) {
|
||||
GGML_ABORT("invalid core id in SPACEMIT_PERFER_CORE_ID: %d, should be between 0 and %d\n", core_id,
|
||||
num_cores - 1);
|
||||
}
|
||||
auto core_info = core_info_list[core_id];
|
||||
auto core_arch_id = core_info.arch_id;
|
||||
if (core_arch_id == perfer_core_arch_id) {
|
||||
cpu_mask |= (1ULL << core_id);
|
||||
perfer_core_ids.push_back(core_id);
|
||||
} else {
|
||||
GGML_ABORT(
|
||||
"core id %d in SPACEMIT_PERFER_CORE_ID has arch id %x which does not match "
|
||||
"SPACEMIT_PERFER_CORE_ARCH %x\n",
|
||||
core_id, (uint16_t) core_arch_id, (uint16_t) perfer_core_arch_id);
|
||||
}
|
||||
}
|
||||
std::string perfer_core_id_vec_str;
|
||||
for (int core_id : perfer_core_id_vec) {
|
||||
perfer_core_id_vec_str += std::to_string(core_id) + ",";
|
||||
}
|
||||
perfer_core_id_vec_str.pop_back();
|
||||
GGML_LOG_DEBUG("SPACEMIT_PERFER_CORE_ID is set, perferred core ids: %s\n", perfer_core_id_vec_str.c_str());
|
||||
num_perfer_cores = static_cast<int>(perfer_core_id_vec.size());
|
||||
}
|
||||
|
||||
use_ime1 = perfer_core_arch_id == spine_core_arch_id::core_arch_a60 ||
|
||||
perfer_core_arch_id == spine_core_arch_id::core_arch_x100;
|
||||
|
||||
use_ime2 = perfer_core_arch_id == spine_core_arch_id::core_arch_a100;
|
||||
|
||||
mem_backend = parse_mem_backend(getenv("SPACEMIT_MEM_BACKEND"));
|
||||
char * spine_disable_tcm_str = getenv("SPACEMIT_DISABLE_TCM");
|
||||
auto user_disable_tcm = spine_disable_tcm_str != nullptr && strcmp(spine_disable_tcm_str, "0") != 0;
|
||||
|
||||
if (!user_disable_tcm) {
|
||||
spine_mem_pool_tcm_info tcm_info;
|
||||
if (spine_mem_pool_tcm_init(&tcm_info)) {
|
||||
use_tcm = tcm_info.available;
|
||||
tcm_blk_size = tcm_info.blk_size;
|
||||
GGML_LOG_DEBUG("CPU_RISCV64_SPACEMIT: tcm is available, blk_size: %zu, blk_num: %zu, is_fake_tcm: %d\n",
|
||||
tcm_info.blk_size, tcm_info.blk_num, tcm_info.is_fake_tcm);
|
||||
|
||||
for (auto & core_info : core_info_list) {
|
||||
auto core_arch_head = (uint16_t) (core_info.arch_id) >> 12;
|
||||
if (core_arch_head != 0xA) {
|
||||
aicpu_id_offset++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GGML_LOG_DEBUG(
|
||||
"CPU_RISCV64_SPACEMIT: num_cores: %d, num_perfer_cores: %d, perfer_core_arch_id: %x, exclude_main_thread: %d, "
|
||||
"use_ime1: %d, use_ime2: %d, mem_backend: %s, cpu_mask: %lx, aicpu_id_offset: %d\n",
|
||||
num_cores, num_perfer_cores, (uint16_t) perfer_core_arch_id, exclude_main_thread, use_ime1, use_ime2,
|
||||
spine_mem_pool_backend_to_string(mem_backend), cpu_mask, aicpu_id_offset);
|
||||
|
||||
const size_t init_barrier_size = sizeof(spine_barrier_t) * spine_init_barrier_count;
|
||||
init_barrier =
|
||||
static_cast<spine_barrier_t *>(spine_mem_pool_shared_mem_alloc(init_barrier_size, alignof(spine_barrier_t)));
|
||||
if (init_barrier != nullptr) {
|
||||
init_barrier_is_shared_mem = true;
|
||||
} else {
|
||||
GGML_LOG_WARN("CPU_RISCV64_SPACEMIT: failed to allocate init_barrier from shared mem, falling back to heap\n",
|
||||
__func__);
|
||||
init_barrier = new spine_barrier_t[spine_init_barrier_count];
|
||||
}
|
||||
|
||||
spine_barrier_init(init_barrier, spine_init_barrier_count, 2);
|
||||
}
|
||||
|
||||
spine_env_info::~spine_env_info() {
|
||||
if (init_barrier_is_shared_mem) {
|
||||
spine_mem_pool_shared_mem_free(init_barrier);
|
||||
} else {
|
||||
delete[] init_barrier;
|
||||
}
|
||||
|
||||
init_barrier = nullptr;
|
||||
init_barrier_is_shared_mem = false;
|
||||
}
|
||||
|
||||
spine_env_info global_spine_env_info;
|
||||
|
||||
} // namespace ggml::cpu::riscv64_spacemit
|
||||
@@ -0,0 +1,55 @@
|
||||
#pragma once
|
||||
|
||||
#include "spine_barrier.h"
|
||||
#include "spine_mem_pool.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
namespace ggml::cpu::riscv64_spacemit {
|
||||
|
||||
constexpr uint64_t spine_invalid_core_id = 0xFFFFFFFF;
|
||||
constexpr size_t spine_init_barrier_count = 16;
|
||||
|
||||
enum class spine_core_arch_id : uint16_t {
|
||||
core_arch_none = 0,
|
||||
core_arch_x60 = 0x503C,
|
||||
core_arch_x100 = 0x5064,
|
||||
core_arch_x200 = 0x50C8,
|
||||
core_arch_a60 = 0xA03C,
|
||||
core_arch_a100 = 0xA064,
|
||||
core_arch_a200 = 0xA0C8,
|
||||
};
|
||||
|
||||
struct spine_core_info {
|
||||
uint64_t core_id{ spine_invalid_core_id };
|
||||
spine_core_arch_id arch_id{ spine_core_arch_id::core_arch_none };
|
||||
|
||||
static bool get_spine_core_info(std::vector<spine_core_info> & result);
|
||||
};
|
||||
|
||||
struct spine_env_info {
|
||||
std::vector<spine_core_info> core_info_list;
|
||||
std::vector<int> perfer_core_ids;
|
||||
int aicpu_id_offset{ 0 };
|
||||
int num_cores{ 0 };
|
||||
int num_perfer_cores{ 0 };
|
||||
spine_core_arch_id perfer_core_arch_id{ spine_core_arch_id::core_arch_none };
|
||||
bool exclude_main_thread{ false };
|
||||
bool use_ime2{ false };
|
||||
bool use_ime1{ false };
|
||||
bool use_tcm{ false };
|
||||
spine_mem_pool_backend mem_backend{ spine_mem_pool_backend::transparent_hugepage };
|
||||
uint64_t tcm_blk_size{ 0 };
|
||||
uint64_t cpu_mask{ 0 };
|
||||
spine_barrier_t * init_barrier{ nullptr };
|
||||
bool init_barrier_is_shared_mem{ false };
|
||||
|
||||
spine_env_info();
|
||||
~spine_env_info();
|
||||
};
|
||||
|
||||
extern spine_env_info global_spine_env_info;
|
||||
|
||||
} // namespace ggml::cpu::riscv64_spacemit
|
||||
@@ -1,26 +1,189 @@
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <functional>
|
||||
|
||||
namespace spacemit_kernels {
|
||||
|
||||
#define BLOCK_QNK_LEN 256
|
||||
|
||||
template <int N> struct nrow_block_q2_k {
|
||||
// [4bit scale + 4bit zp] * N * 16
|
||||
uint8_t scales[N * BLOCK_QNK_LEN / 16];
|
||||
// [b0, b16, b32, b48] [b1, b17, b33, b49] ... [b15, b31, b47, b63]
|
||||
// [b64, b80, b96, b112] ...[b79, b95, b111, b127]
|
||||
// [b128, b144, b160, b176] ...[b143, b159, b175, b191]
|
||||
// [b192, b208, b224, b240] ...[b207, b223, b239, b255]
|
||||
uint8_t qs[N * BLOCK_QNK_LEN / 4];
|
||||
uint16_t scales16[N];
|
||||
uint16_t zeros16[N];
|
||||
};
|
||||
|
||||
template <int N> struct nrow_block_q3_k {
|
||||
// [8bit scale] * N * 16
|
||||
int8_t scales[N * 16];
|
||||
// [b0, b1, b2, b3, b4, b5, b6, b7] ... [b248, b249, b250, b251, b252, b253, b254, b255]
|
||||
uint8_t hmask[N * BLOCK_QNK_LEN / 8];
|
||||
// [b0, b16, b32, b48] [b1, b17, b33, b49] ... [b15, b31, b47, b63]
|
||||
// [b64, b80, b96, b112] ...[b79, b95, b111, b127]
|
||||
// [b128, b144, b160, b176] ...[b143, b159, b175, b191]
|
||||
// [b192, b208, b224, b240] ...[b207, b223, b239, b255]
|
||||
uint8_t qs[N * BLOCK_QNK_LEN / 4];
|
||||
uint16_t scales16[N];
|
||||
};
|
||||
|
||||
template <int N> struct nrow_block_mxfp4 {
|
||||
uint8_t e[N];
|
||||
uint8_t qh[4 * N];
|
||||
uint8_t qs[16 * N];
|
||||
};
|
||||
|
||||
template <int N> struct __attribute__((packed)) nrow_block_q5_1 {
|
||||
uint16_t scales16[N];
|
||||
uint8_t zp[N];
|
||||
// n0 [bh0, bh1, bh2, bh3, bh4, bh5, bh6, bh7] ....
|
||||
uint8_t qh[4 * N];
|
||||
// n0 [b0, b1], [b2, b3] .... [b30, b31]
|
||||
// n1 [b0, b1], [b2, b3] .... [b30, b31]
|
||||
uint8_t qs[16 * N];
|
||||
};
|
||||
|
||||
static_assert(sizeof(nrow_block_q5_1<1>) == sizeof(uint8_t) + 22, "wrong nrow_block_q5_1 block size/padding");
|
||||
|
||||
template <int N> struct __attribute__((packed)) nrow_block_q5_0 {
|
||||
uint16_t scales16[N];
|
||||
// n0 [bh0, bh1, bh2, bh3, bh4, bh5, bh6, bh7] ....
|
||||
uint8_t qh[4 * N];
|
||||
// n0 [b0, b1], [b2, b3] .... [b30, b31]
|
||||
// n1 [b0, b1], [b2, b3] .... [b30, b31]
|
||||
uint8_t qs[16 * N];
|
||||
};
|
||||
|
||||
static_assert(sizeof(nrow_block_q5_0<1>) == 22, "wrong nrow_block_q5_0 block size/padding");
|
||||
|
||||
using gemm_kernel_quantize_def = std::function<
|
||||
size_t(size_t, const uint8_t *, const uint8_t *, const uint8_t *, float *, size_t, size_t, size_t, size_t)>;
|
||||
|
||||
using moe_gemm_kernel_quantize_def = std::function<
|
||||
size_t(size_t, const uint8_t **, const uint8_t *, const uint8_t *, float **, size_t, size_t, size_t, size_t)>;
|
||||
|
||||
namespace sqnbitgemm_spacemit_ime {
|
||||
namespace ime1 {
|
||||
size_t gemm_kernel_i8i4(size_t blk_len,
|
||||
const std::byte * quant_a_ptr,
|
||||
const std::byte * quant_b_data,
|
||||
const float * quant_b_scale,
|
||||
const std::byte * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t count_k,
|
||||
size_t block_count_k,
|
||||
size_t ldc,
|
||||
const float * bias,
|
||||
const size_t scale_stride);
|
||||
size_t gemm_kernel_i8i4(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
|
||||
void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
|
||||
void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
} // namespace ime1
|
||||
} // namespace sqnbitgemm_spacemit_ime
|
||||
|
||||
namespace ime2 {
|
||||
size_t gemm_kernel_i8i2k(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t gemm_kernel_i8i3k(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t gemm_kernel_i8i4(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t gemm_kernel_i8i4_hp(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t moe_m2_gemm_kernel_i8i4(size_t blk_len,
|
||||
const uint8_t ** quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float ** c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t gemm_kernel_i8i8(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t gemm_kernel_i8mxfp4(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t moe_m2_gemm_kernel_i8mxfp4(size_t blk_len,
|
||||
const uint8_t ** quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float ** c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t gemm_kernel_i8i5(size_t blk_len,
|
||||
const uint8_t * quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float * c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
|
||||
size_t moe_m2_gemm_kernel_i8i5(size_t blk_len,
|
||||
const uint8_t ** quant_a_ptr,
|
||||
const uint8_t * quant_b_data,
|
||||
const uint8_t * quant_b_zp,
|
||||
float ** c_ptr,
|
||||
size_t count_m,
|
||||
size_t count_n,
|
||||
size_t k_blks,
|
||||
size_t ldc);
|
||||
} // namespace ime2
|
||||
} // namespace spacemit_kernels
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,14 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-common.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
namespace ggml::cpu::riscv64_spacemit {
|
||||
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
|
||||
int repack(ggml_tensor * t, const void * data, size_t data_size);
|
||||
|
||||
} // namespace ggml::cpu::riscv64_spacemit
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,95 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
|
||||
namespace spacemit_kernels {
|
||||
|
||||
constexpr auto div_round_up(auto up, auto down) {
|
||||
return (up + down - 1) / down;
|
||||
}
|
||||
|
||||
// Q8 Blk [f32] [s16] [int8 * blk_len]
|
||||
// Q8 Blk N [f32 * N] [s16 * N] [int8 * blk_len * N]
|
||||
constexpr size_t q8_blk_size(size_t blk_len, bool with_blk_sum = false) {
|
||||
const size_t blk_size = sizeof(float) + blk_len * sizeof(int8_t) + (with_blk_sum ? sizeof(int16_t) : 0);
|
||||
return blk_size;
|
||||
}
|
||||
|
||||
// Q8 HP row block: K is split into K32 subblocks.
|
||||
// Each subblock stores [f32 scale] [int8 * 32], with an optional fp16 sum trailer per subblock.
|
||||
constexpr size_t q8_hp_blk_size(size_t blk_len, bool with_blk_sum = false, bool with_blk_scale = false) {
|
||||
const size_t subblk_count = div_round_up(blk_len, size_t(32));
|
||||
const size_t blk_size = blk_len * sizeof(int8_t) + subblk_count * sizeof(_Float16) +
|
||||
(with_blk_sum ? subblk_count * sizeof(_Float16) : 0) +
|
||||
(with_blk_scale ? sizeof(_Float16) : 0);
|
||||
return blk_size;
|
||||
}
|
||||
|
||||
// Q8K Blk [f32] [s16 * (blk_len / 16)] [int8 * blk_len]
|
||||
// Q8K Blk N [f32 * N] [s16 * (blk_len / 16) * N] [int8 * blk_len * N]
|
||||
constexpr size_t q8k_blk_size(size_t blk_len) {
|
||||
const size_t blk_size = sizeof(float) + blk_len * sizeof(int8_t) + sizeof(int16_t) * blk_len / 16;
|
||||
return blk_size;
|
||||
}
|
||||
|
||||
using quantize_a_row_def = std::function<void(size_t, const float *, size_t, uint8_t *)>;
|
||||
|
||||
namespace rvv {
|
||||
void memcpy1d(void * dst, const void * src, int64_t size);
|
||||
|
||||
void memcpy2d(void * dst, int64_t dst_stride, const void * src, int64_t src_stride, int64_t tile_rows, int64_t size);
|
||||
|
||||
void forward_flash_attn_ext_f16_one_chunk_vlen1024_vf16(const ggml_compute_params * params,
|
||||
ggml_tensor * dst,
|
||||
int ir0,
|
||||
int ir1,
|
||||
void * tcm_buffer,
|
||||
size_t tcm_buffer_size);
|
||||
|
||||
void forward_flash_attn_ext_f16_tiled_vlen1024_vf16(const ggml_compute_params * params,
|
||||
ggml_tensor * dst,
|
||||
int ir0,
|
||||
int ir1,
|
||||
void * tcm_buffer,
|
||||
size_t tcm_buffer_size);
|
||||
|
||||
void forward_rms_norm_f32(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
void forward_norm_f32(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
void forward_cont_with_permute(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
void forward_cpy_with_permute(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
template <typename T> void forward_get_rows(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
template <typename T> void forward_concat(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
template <ggml_op op_type, typename T> void forward_binary(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
template <typename T> void forward_sum_rows(const ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
template <typename T> void forward_repeat_nrows(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
template <typename T> void forward_repeat_dim1(ggml_compute_params * params, ggml_tensor * op);
|
||||
|
||||
void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
void quantize_a_row_i8_hp(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
void quantize_a_4row_i8_hp(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
void quantize_a_row_i8k(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
void quantize_a_4row_i8k(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
|
||||
|
||||
} // namespace rvv
|
||||
|
||||
} // namespace spacemit_kernels
|
||||
@@ -0,0 +1,34 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
|
||||
#define SPINE_CACHE_LINE 64
|
||||
#define SPINE_CACHE_ALIGN __attribute__((aligned(SPINE_CACHE_LINE)))
|
||||
|
||||
struct spine_barrier_t {
|
||||
SPINE_CACHE_ALIGN std::atomic<int64_t> pending_;
|
||||
SPINE_CACHE_ALIGN std::atomic<int64_t> rounds_;
|
||||
SPINE_CACHE_ALIGN int64_t total_;
|
||||
};
|
||||
|
||||
inline void spine_barrier_wait(spine_barrier_t * b) {
|
||||
auto cur_round = b->rounds_.load(std::memory_order_acquire);
|
||||
auto cnt = --b->pending_;
|
||||
if (cnt == 0) {
|
||||
b->pending_.store(b->total_);
|
||||
b->rounds_.store(cur_round + 1);
|
||||
} else {
|
||||
while (cur_round == b->rounds_.load(std::memory_order_relaxed)) {
|
||||
__asm__ volatile("pause " ::: "memory");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void spine_barrier_init(spine_barrier_t * b, int num_barriers, uint64_t thread_count) {
|
||||
for (int i = 0; i < num_barriers; i++) {
|
||||
b[i].total_ = thread_count;
|
||||
b[i].pending_.store(thread_count);
|
||||
b[i].rounds_.store(0);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,760 @@
|
||||
#include "spine_mem_pool.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "ime_env.h"
|
||||
#include "spine_tcm.h"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cerrno>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace ggml::cpu::riscv64_spacemit {
|
||||
namespace {
|
||||
|
||||
constexpr size_t SPINE_MEM_POOL_CHUNK_SIZE = 512ull * 1024ull * 1024ull;
|
||||
constexpr size_t SPINE_SHARE_MEM_POOL_CHUNK_SIZE = 512ull * 1024ull;
|
||||
constexpr size_t SPINE_MEM_POOL_1G_REGION_SIZE = 1ull << 30;
|
||||
constexpr uint64_t HUGETLB_1G_FLAG_REQUIRE_PUD = 1ull << 0;
|
||||
constexpr char SPINE_MEM_POOL_HUGETLB_1G_DEV[] = "/dev/hugetlb_1g";
|
||||
constexpr char SPINE_MEM_POOL_TCM_SYNC_MEM_DEV[] = "/dev/tcm_sync_mem";
|
||||
|
||||
struct hugetlb_1g_region {
|
||||
uint64_t size{ 0 };
|
||||
uint64_t dma_addr{ 0 };
|
||||
uint64_t flags{ 0 };
|
||||
uint64_t reserved{ 0 };
|
||||
};
|
||||
|
||||
#define HUGETLB_1G_IOC_MAGIC 'M'
|
||||
#define HUGETLB_1G_IOC_ALLOC _IOWR(HUGETLB_1G_IOC_MAGIC, 0x00, struct hugetlb_1g_region)
|
||||
#define HUGETLB_1G_IOC_FREE _IO(HUGETLB_1G_IOC_MAGIC, 0x01)
|
||||
|
||||
struct free_block {
|
||||
size_t offset{ 0 };
|
||||
size_t size{ 0 };
|
||||
};
|
||||
|
||||
struct pool_chunk {
|
||||
uint8_t * base{ nullptr };
|
||||
size_t size{ 0 };
|
||||
int fd{ -1 };
|
||||
std::vector<free_block> free_blocks;
|
||||
};
|
||||
|
||||
struct pool_allocation {
|
||||
void * chunk_base{ nullptr };
|
||||
size_t chunk_size{ 0 };
|
||||
void * base{ nullptr };
|
||||
size_t size{ 0 };
|
||||
};
|
||||
|
||||
bool is_power_of_two(size_t value) {
|
||||
return value != 0 && (value & (value - 1)) == 0;
|
||||
}
|
||||
|
||||
bool align_up(size_t value, size_t alignment, size_t * aligned_value) {
|
||||
if (aligned_value == nullptr || alignment == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t remainder = value % alignment;
|
||||
if (remainder == 0) {
|
||||
*aligned_value = value;
|
||||
return true;
|
||||
}
|
||||
|
||||
const size_t padding = alignment - remainder;
|
||||
if (value > std::numeric_limits<size_t>::max() - padding) {
|
||||
return false;
|
||||
}
|
||||
|
||||
*aligned_value = value + padding;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool align_up_uintptr(uintptr_t value, size_t alignment, uintptr_t * aligned_value) {
|
||||
if (aligned_value == nullptr || alignment == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const uintptr_t remainder = value % alignment;
|
||||
if (remainder == 0) {
|
||||
*aligned_value = value;
|
||||
return true;
|
||||
}
|
||||
|
||||
const uintptr_t padding = alignment - remainder;
|
||||
if (value > std::numeric_limits<uintptr_t>::max() - padding) {
|
||||
return false;
|
||||
}
|
||||
|
||||
*aligned_value = value + padding;
|
||||
return true;
|
||||
}
|
||||
|
||||
class spine_mem_pool_manager {
|
||||
public:
|
||||
explicit spine_mem_pool_manager(size_t default_chunk_size) : default_chunk_size_(default_chunk_size) {}
|
||||
|
||||
virtual ~spine_mem_pool_manager() = default;
|
||||
|
||||
void * alloc(size_t size, size_t alignment) {
|
||||
if (size == 0 || !is_power_of_two(alignment)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
size_t aligned_size = 0;
|
||||
if (!align_up(size, alignment, &aligned_size)) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: align_up failed for size %zu alignment %zu\n", __func__, size,
|
||||
alignment);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
pool_allocation allocation;
|
||||
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
|
||||
if (!try_alloc_locked(aligned_size, alignment, &allocation)) {
|
||||
if (!add_chunk_locked(aligned_size, alignment)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (!try_alloc_locked(aligned_size, alignment, &allocation)) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation retry failed for size %zu alignment %zu\n",
|
||||
__func__, aligned_size, alignment);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
const auto [allocation_it, inserted] = allocations_.emplace(allocation.base, allocation);
|
||||
if (!inserted) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: duplicate allocation key %p\n", __func__, allocation.base);
|
||||
rollback_allocation_locked(allocation);
|
||||
return nullptr;
|
||||
}
|
||||
} catch (const std::bad_alloc &) {
|
||||
rollback_allocation_locked(allocation);
|
||||
throw;
|
||||
}
|
||||
|
||||
return allocation.base;
|
||||
}
|
||||
|
||||
void free(void * base) {
|
||||
if (base == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
|
||||
auto allocation_it = allocations_.find(base);
|
||||
if (allocation_it == allocations_.end()) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: unknown allocation %p\n", __func__, base);
|
||||
return;
|
||||
}
|
||||
|
||||
pool_allocation allocation = allocation_it->second;
|
||||
allocations_.erase(allocation_it);
|
||||
|
||||
auto chunk_it = find_chunk_locked(allocation);
|
||||
if (chunk_it == chunks_.end()) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: unknown chunk for allocation %p size %zu\n", __func__,
|
||||
allocation.base, allocation.size);
|
||||
return;
|
||||
}
|
||||
|
||||
auto * chunk_base = chunk_it->base;
|
||||
auto * alloc_base = static_cast<uint8_t *>(allocation.base);
|
||||
if (alloc_base < chunk_base || alloc_base >= chunk_base + chunk_it->size) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation %p out of chunk range %p..%p\n", __func__,
|
||||
allocation.base, chunk_base, chunk_base + chunk_it->size);
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t offset = static_cast<size_t>(alloc_base - chunk_base);
|
||||
if (offset > chunk_it->size || allocation.size > chunk_it->size - offset) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation %p size %zu exceeds chunk size %zu\n", __func__,
|
||||
allocation.base, allocation.size, chunk_it->size);
|
||||
return;
|
||||
}
|
||||
|
||||
insert_free_block_locked(*chunk_it, { offset, allocation.size });
|
||||
maybe_release_empty_chunk_locked(chunk_it);
|
||||
}
|
||||
|
||||
protected:
|
||||
void release_chunks() {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
|
||||
allocations_.clear();
|
||||
for (auto & chunk : chunks_) {
|
||||
dealloc_chunk(&chunk);
|
||||
}
|
||||
chunks_.clear();
|
||||
}
|
||||
|
||||
size_t default_chunk_size() const { return default_chunk_size_; }
|
||||
|
||||
static void clear_chunk(pool_chunk * chunk) {
|
||||
chunk->base = nullptr;
|
||||
chunk->size = 0;
|
||||
chunk->fd = -1;
|
||||
chunk->free_blocks.clear();
|
||||
}
|
||||
|
||||
virtual bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) = 0;
|
||||
virtual void dealloc_chunk(pool_chunk * chunk) = 0;
|
||||
|
||||
private:
|
||||
struct alloc_candidate {
|
||||
size_t chunk_index{ 0 };
|
||||
size_t block_index{ 0 };
|
||||
size_t aligned_offset{ 0 };
|
||||
uintptr_t address{ std::numeric_limits<uintptr_t>::max() };
|
||||
bool valid{ false };
|
||||
};
|
||||
|
||||
std::vector<pool_chunk>::iterator find_chunk_locked(const pool_allocation & allocation) {
|
||||
return std::find_if(chunks_.begin(), chunks_.end(), [&](const pool_chunk & chunk) {
|
||||
return chunk.base == allocation.chunk_base && chunk.size == allocation.chunk_size;
|
||||
});
|
||||
}
|
||||
|
||||
bool add_chunk_locked(size_t min_size, size_t alignment) {
|
||||
pool_chunk chunk;
|
||||
const size_t chunk_request = default_chunk_size_ == 0 ? min_size : std::max(min_size, default_chunk_size_);
|
||||
void * hint_addr = nullptr;
|
||||
|
||||
for (const auto & existing_chunk : chunks_) {
|
||||
auto * chunk_end = existing_chunk.base + existing_chunk.size;
|
||||
if (hint_addr == nullptr || chunk_end > hint_addr) {
|
||||
hint_addr = chunk_end;
|
||||
}
|
||||
}
|
||||
|
||||
if (!alloc_chunk(chunk_request, alignment, hint_addr, &chunk)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (chunk.base == nullptr || chunk.size < min_size) {
|
||||
GGML_LOG_ERROR(
|
||||
"CPU_RISCV64_SPACEMIT: %s: invalid chunk returned for request size %zu, chunk_base=%p chunk_size=%zu\n",
|
||||
__func__, min_size, chunk.base, chunk.size);
|
||||
dealloc_chunk(&chunk);
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
chunk.free_blocks.push_back({ 0, chunk.size });
|
||||
chunks_.push_back(std::move(chunk));
|
||||
} catch (const std::bad_alloc &) {
|
||||
dealloc_chunk(&chunk);
|
||||
throw;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void rollback_allocation_locked(const pool_allocation & allocation) {
|
||||
auto chunk_it = find_chunk_locked(allocation);
|
||||
if (chunk_it == chunks_.end()) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p, owning chunk not found\n",
|
||||
__func__, allocation.base);
|
||||
return;
|
||||
}
|
||||
|
||||
auto * chunk_base = chunk_it->base;
|
||||
auto * alloc_base = static_cast<uint8_t *>(allocation.base);
|
||||
if (alloc_base < chunk_base || alloc_base >= chunk_base + chunk_it->size) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p, chunk range is invalid\n",
|
||||
__func__, allocation.base);
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t offset = static_cast<size_t>(alloc_base - chunk_base);
|
||||
if (offset > chunk_it->size || allocation.size > chunk_it->size - offset) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p size %zu\n", __func__,
|
||||
allocation.base, allocation.size);
|
||||
return;
|
||||
}
|
||||
|
||||
insert_free_block_locked(*chunk_it, { offset, allocation.size });
|
||||
maybe_release_empty_chunk_locked(chunk_it);
|
||||
}
|
||||
|
||||
bool try_alloc_locked(size_t size, size_t alignment, pool_allocation * allocation) {
|
||||
alloc_candidate best;
|
||||
|
||||
for (size_t chunk_index = 0; chunk_index < chunks_.size(); ++chunk_index) {
|
||||
const auto & chunk = chunks_[chunk_index];
|
||||
for (size_t block_index = 0; block_index < chunk.free_blocks.size(); ++block_index) {
|
||||
const auto & block = chunk.free_blocks[block_index];
|
||||
|
||||
uintptr_t aligned_addr = 0;
|
||||
const auto block_addr = reinterpret_cast<uintptr_t>(chunk.base + block.offset);
|
||||
if (!align_up_uintptr(block_addr, alignment, &aligned_addr)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (aligned_addr < block_addr) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t aligned_offset = block.offset + static_cast<size_t>(aligned_addr - block_addr);
|
||||
const size_t padding = aligned_offset - block.offset;
|
||||
if (padding > block.size || size > block.size - padding) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!best.valid || aligned_addr < best.address) {
|
||||
best.chunk_index = chunk_index;
|
||||
best.block_index = block_index;
|
||||
best.aligned_offset = aligned_offset;
|
||||
best.address = aligned_addr;
|
||||
best.valid = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!best.valid) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto & chunk = chunks_[best.chunk_index];
|
||||
const free_block block = chunk.free_blocks[best.block_index];
|
||||
const size_t padding = best.aligned_offset - block.offset;
|
||||
const size_t alloc_end = best.aligned_offset + size;
|
||||
const size_t block_end = block.offset + block.size;
|
||||
|
||||
chunk.free_blocks.erase(chunk.free_blocks.begin() + best.block_index);
|
||||
auto insert_it = chunk.free_blocks.begin() + best.block_index;
|
||||
if (padding != 0) {
|
||||
insert_it = chunk.free_blocks.insert(insert_it, { block.offset, padding });
|
||||
++insert_it;
|
||||
}
|
||||
if (alloc_end < block_end) {
|
||||
chunk.free_blocks.insert(insert_it, { alloc_end, block_end - alloc_end });
|
||||
}
|
||||
|
||||
allocation->chunk_base = chunk.base;
|
||||
allocation->chunk_size = chunk.size;
|
||||
allocation->base = chunk.base + best.aligned_offset;
|
||||
allocation->size = size;
|
||||
return true;
|
||||
}
|
||||
|
||||
void maybe_release_empty_chunk_locked(std::vector<pool_chunk>::iterator chunk_it) {
|
||||
if (chunk_it->free_blocks.size() != 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto & block = chunk_it->free_blocks.front();
|
||||
if (block.offset != 0 || block.size != chunk_it->size) {
|
||||
return;
|
||||
}
|
||||
|
||||
dealloc_chunk(&*chunk_it);
|
||||
chunks_.erase(chunk_it);
|
||||
}
|
||||
|
||||
void insert_free_block_locked(pool_chunk & chunk, free_block block) {
|
||||
auto it = chunk.free_blocks.begin();
|
||||
while (it != chunk.free_blocks.end() && it->offset < block.offset) {
|
||||
++it;
|
||||
}
|
||||
|
||||
if (it != chunk.free_blocks.begin()) {
|
||||
const auto & prev = *(it - 1);
|
||||
if (prev.offset + prev.size > block.offset) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: overlapping free block at offset %zu size %zu\n", __func__,
|
||||
block.offset, block.size);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (it != chunk.free_blocks.end() && block.offset + block.size > it->offset) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: overlapping next free block at offset %zu size %zu\n", __func__,
|
||||
block.offset, block.size);
|
||||
return;
|
||||
}
|
||||
|
||||
it = chunk.free_blocks.insert(it, block);
|
||||
|
||||
if (it != chunk.free_blocks.begin()) {
|
||||
auto prev = it - 1;
|
||||
if (prev->offset + prev->size == it->offset) {
|
||||
it->offset = prev->offset;
|
||||
it->size += prev->size;
|
||||
it = chunk.free_blocks.erase(prev);
|
||||
}
|
||||
}
|
||||
|
||||
if (it + 1 != chunk.free_blocks.end() && it->offset + it->size == (it + 1)->offset) {
|
||||
it->size += (it + 1)->size;
|
||||
chunk.free_blocks.erase(it + 1);
|
||||
}
|
||||
}
|
||||
|
||||
std::mutex mutex_;
|
||||
std::vector<pool_chunk> chunks_;
|
||||
std::unordered_map<void *, pool_allocation> allocations_;
|
||||
size_t default_chunk_size_{ 0 };
|
||||
};
|
||||
|
||||
class spine_mem_pool_posix final : public spine_mem_pool_manager {
|
||||
public:
|
||||
spine_mem_pool_posix() : spine_mem_pool_manager(0) {}
|
||||
|
||||
~spine_mem_pool_posix() override { release_chunks(); }
|
||||
|
||||
private:
|
||||
bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
|
||||
(void) hint_addr;
|
||||
|
||||
const size_t alloc_alignment = std::max(alignment, sizeof(void *));
|
||||
void * base = nullptr;
|
||||
const int rc = posix_memalign(&base, alloc_alignment, min_size);
|
||||
if (rc != 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: posix_memalign failed for size %zu alignment %zu, rc=%d\n",
|
||||
__func__, min_size, alloc_alignment, rc);
|
||||
return false;
|
||||
}
|
||||
|
||||
chunk->base = static_cast<uint8_t *>(base);
|
||||
chunk->size = min_size;
|
||||
chunk->fd = -1;
|
||||
return true;
|
||||
}
|
||||
|
||||
void dealloc_chunk(pool_chunk * chunk) override {
|
||||
std::free(chunk->base);
|
||||
clear_chunk(chunk);
|
||||
}
|
||||
};
|
||||
|
||||
class spine_mem_pool_transparent_hugepage final : public spine_mem_pool_manager {
|
||||
public:
|
||||
spine_mem_pool_transparent_hugepage() : spine_mem_pool_manager(SPINE_MEM_POOL_CHUNK_SIZE) {}
|
||||
|
||||
~spine_mem_pool_transparent_hugepage() override { release_chunks(); }
|
||||
|
||||
private:
|
||||
bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
|
||||
(void) alignment;
|
||||
|
||||
size_t chunk_size = 0;
|
||||
if (!align_up(min_size, default_chunk_size(), &chunk_size)) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to round chunk size for %zu\n", __func__, min_size);
|
||||
return false;
|
||||
}
|
||||
|
||||
void * map_addr = mmap(hint_addr, chunk_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||
if (map_addr == MAP_FAILED) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for chunk size %zu, errno=%d\n", __func__, chunk_size,
|
||||
errno);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (madvise(map_addr, chunk_size, MADV_HUGEPAGE) != 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: madvise(MADV_HUGEPAGE) failed for chunk size %zu, errno=%d\n",
|
||||
__func__, chunk_size, errno);
|
||||
munmap(map_addr, chunk_size);
|
||||
return false;
|
||||
}
|
||||
|
||||
chunk->base = static_cast<uint8_t *>(map_addr);
|
||||
chunk->size = chunk_size;
|
||||
chunk->fd = -1;
|
||||
return true;
|
||||
}
|
||||
|
||||
void dealloc_chunk(pool_chunk * chunk) override {
|
||||
if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for chunk %p size %zu, errno=%d\n", __func__,
|
||||
chunk->base, chunk->size, errno);
|
||||
}
|
||||
|
||||
clear_chunk(chunk);
|
||||
}
|
||||
};
|
||||
|
||||
class spine_mem_pool_hugetlb_1g final : public spine_mem_pool_manager {
|
||||
public:
|
||||
spine_mem_pool_hugetlb_1g() : spine_mem_pool_manager(SPINE_MEM_POOL_1G_REGION_SIZE) {}
|
||||
|
||||
~spine_mem_pool_hugetlb_1g() override { release_chunks(); }
|
||||
|
||||
private:
|
||||
bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
|
||||
(void) alignment;
|
||||
(void) hint_addr;
|
||||
|
||||
size_t region_size = 0;
|
||||
if (!align_up(min_size, SPINE_MEM_POOL_1G_REGION_SIZE, ®ion_size)) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to round hugetlb_1g size for %zu\n", __func__, min_size);
|
||||
return false;
|
||||
}
|
||||
|
||||
const int fd = open(SPINE_MEM_POOL_HUGETLB_1G_DEV, O_RDWR);
|
||||
if (fd < 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: open(%s) failed, errno=%d\n", __func__,
|
||||
SPINE_MEM_POOL_HUGETLB_1G_DEV, errno);
|
||||
return false;
|
||||
}
|
||||
|
||||
hugetlb_1g_region region;
|
||||
region.size = region_size;
|
||||
region.flags = HUGETLB_1G_FLAG_REQUIRE_PUD;
|
||||
if (ioctl(fd, HUGETLB_1G_IOC_ALLOC, ®ion) < 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: HUGETLB_1G_IOC_ALLOC failed for size %zu, errno=%d\n", __func__,
|
||||
region_size, errno);
|
||||
close(fd);
|
||||
return false;
|
||||
}
|
||||
|
||||
void * map_addr = mmap(nullptr, region.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
if (map_addr == MAP_FAILED) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for hugetlb_1g size %llu, errno=%d\n", __func__,
|
||||
static_cast<unsigned long long>(region.size), errno);
|
||||
ioctl(fd, HUGETLB_1G_IOC_FREE);
|
||||
close(fd);
|
||||
return false;
|
||||
}
|
||||
|
||||
chunk->base = static_cast<uint8_t *>(map_addr);
|
||||
chunk->size = region.size;
|
||||
chunk->fd = fd;
|
||||
return true;
|
||||
}
|
||||
|
||||
void dealloc_chunk(pool_chunk * chunk) override {
|
||||
if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for hugetlb_1g chunk %p size %zu, errno=%d\n",
|
||||
__func__, chunk->base, chunk->size, errno);
|
||||
}
|
||||
|
||||
if (chunk->fd >= 0) {
|
||||
if (ioctl(chunk->fd, HUGETLB_1G_IOC_FREE) < 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: HUGETLB_1G_IOC_FREE failed for chunk %p, errno=%d\n",
|
||||
__func__, chunk->base, errno);
|
||||
}
|
||||
|
||||
close(chunk->fd);
|
||||
}
|
||||
|
||||
clear_chunk(chunk);
|
||||
}
|
||||
};
|
||||
|
||||
class spine_mem_pool_shared_mem final : public spine_mem_pool_manager {
|
||||
public:
|
||||
spine_mem_pool_shared_mem() : spine_mem_pool_manager(SPINE_SHARE_MEM_POOL_CHUNK_SIZE) {}
|
||||
|
||||
~spine_mem_pool_shared_mem() override { release_chunks(); }
|
||||
|
||||
private:
|
||||
bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
|
||||
(void) alignment;
|
||||
|
||||
if (hint_addr != nullptr) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: shared_mem does not support multiple active chunks\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (min_size > default_chunk_size()) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: shared_mem request %zu exceeds chunk size %zu\n", __func__,
|
||||
min_size, default_chunk_size());
|
||||
return false;
|
||||
}
|
||||
|
||||
const int fd = open(SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, O_RDWR | O_SYNC);
|
||||
if (fd < 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: open(%s) failed, errno=%d\n", __func__,
|
||||
SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, errno);
|
||||
return false;
|
||||
}
|
||||
|
||||
void * map_addr = mmap(nullptr, default_chunk_size(), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
if (map_addr == MAP_FAILED) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for %s size %zu, errno=%d\n", __func__,
|
||||
SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, default_chunk_size(), errno);
|
||||
close(fd);
|
||||
return false;
|
||||
}
|
||||
|
||||
chunk->base = static_cast<uint8_t *>(map_addr);
|
||||
chunk->size = default_chunk_size();
|
||||
chunk->fd = fd;
|
||||
return true;
|
||||
}
|
||||
|
||||
void dealloc_chunk(pool_chunk * chunk) override {
|
||||
if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for shared_mem chunk %p size %zu, errno=%d\n",
|
||||
__func__, chunk->base, chunk->size, errno);
|
||||
}
|
||||
|
||||
if (chunk->fd >= 0) {
|
||||
close(chunk->fd);
|
||||
}
|
||||
|
||||
clear_chunk(chunk);
|
||||
}
|
||||
};
|
||||
|
||||
spine_mem_pool_manager & get_spine_mem_pool_manager() {
|
||||
static std::once_flag pool_once;
|
||||
static std::unique_ptr<spine_mem_pool_manager> selected_pool;
|
||||
static spine_mem_pool_backend selected_backend = spine_mem_pool_backend::none;
|
||||
|
||||
spine_mem_pool_backend backend = global_spine_env_info.mem_backend;
|
||||
if (backend == spine_mem_pool_backend::none) {
|
||||
backend = spine_mem_pool_backend::transparent_hugepage;
|
||||
}
|
||||
|
||||
std::call_once(pool_once, [&]() {
|
||||
selected_backend = backend;
|
||||
|
||||
switch (selected_backend) {
|
||||
case spine_mem_pool_backend::posix_memalign:
|
||||
selected_pool = std::make_unique<spine_mem_pool_posix>();
|
||||
break;
|
||||
case spine_mem_pool_backend::transparent_hugepage:
|
||||
selected_pool = std::make_unique<spine_mem_pool_transparent_hugepage>();
|
||||
break;
|
||||
case spine_mem_pool_backend::hugetlb_1g:
|
||||
selected_pool = std::make_unique<spine_mem_pool_hugetlb_1g>();
|
||||
break;
|
||||
case spine_mem_pool_backend::none:
|
||||
selected_backend = spine_mem_pool_backend::transparent_hugepage;
|
||||
selected_pool = std::make_unique<spine_mem_pool_transparent_hugepage>();
|
||||
break;
|
||||
}
|
||||
});
|
||||
|
||||
if (backend != selected_backend) {
|
||||
GGML_LOG_ERROR(
|
||||
"CPU_RISCV64_SPACEMIT: %s: mem pool backend is process-global and mutually exclusive, requested=%d but "
|
||||
"selected=%d\n",
|
||||
__func__, static_cast<int>(backend), static_cast<int>(selected_backend));
|
||||
}
|
||||
|
||||
if (selected_pool) {
|
||||
return *selected_pool;
|
||||
}
|
||||
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
|
||||
spine_mem_pool_manager & get_spine_mem_pool_shared_mem_manager() {
|
||||
static std::once_flag shared_mem_pool_once;
|
||||
static std::unique_ptr<spine_mem_pool_shared_mem> shared_mem_pool;
|
||||
|
||||
std::call_once(shared_mem_pool_once, [&]() { shared_mem_pool = std::make_unique<spine_mem_pool_shared_mem>(); });
|
||||
|
||||
if (shared_mem_pool) {
|
||||
return *shared_mem_pool;
|
||||
}
|
||||
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
bool spine_mem_pool_tcm_init(spine_mem_pool_tcm_info * info) noexcept {
|
||||
if (info == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
*info = {};
|
||||
|
||||
if (spine_tcm_open_handle(NULL) != 0 || !spine_tcm_is_available()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
spine_tcm_mem_info_t mem_info;
|
||||
if (spine_tcm_mem_info(&mem_info) != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
info->available = true;
|
||||
info->blk_size = mem_info.blk_size;
|
||||
info->blk_num = mem_info.blk_num;
|
||||
info->is_fake_tcm = mem_info.is_fake_tcm != 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
void * spine_mem_pool_tcm_mem_get(int cpu_id) noexcept {
|
||||
return spine_tcm_mem_get(cpu_id);
|
||||
}
|
||||
|
||||
void * spine_mem_pool_tcm_mem_wait(int cpu_id) noexcept {
|
||||
return spine_tcm_mem_try_wait(cpu_id, 1000 * 1000);
|
||||
}
|
||||
|
||||
int spine_mem_pool_tcm_mem_release(int cpu_id) noexcept {
|
||||
return spine_tcm_mem_release(cpu_id);
|
||||
}
|
||||
|
||||
void * spine_mem_pool_alloc(size_t size, size_t alignment) noexcept {
|
||||
try {
|
||||
return get_spine_mem_pool_manager().alloc(size, alignment);
|
||||
} catch (const std::bad_alloc &) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while allocating size %zu\n", __func__, size);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void * spine_mem_pool_shared_mem_alloc(size_t size, size_t alignment) noexcept {
|
||||
try {
|
||||
return get_spine_mem_pool_shared_mem_manager().alloc(size, alignment);
|
||||
} catch (const std::bad_alloc &) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while allocating shared memory size %zu\n", __func__, size);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void spine_mem_pool_free(void * base) noexcept {
|
||||
try {
|
||||
get_spine_mem_pool_manager().free(base);
|
||||
} catch (const std::bad_alloc &) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while freeing allocation %p\n", __func__, base);
|
||||
}
|
||||
}
|
||||
|
||||
void spine_mem_pool_shared_mem_free(void * base) noexcept {
|
||||
try {
|
||||
get_spine_mem_pool_shared_mem_manager().free(base);
|
||||
} catch (const std::bad_alloc &) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while freeing shared allocation %p\n", __func__, base);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ggml::cpu::riscv64_spacemit
|
||||
|
||||
extern "C" {
|
||||
void * ggml_backend_cpu_riscv64_spacemit_alloc_shared(size_t size, size_t alignment) {
|
||||
void * result = ggml::cpu::riscv64_spacemit::spine_mem_pool_shared_mem_alloc(size, alignment);
|
||||
if (result == nullptr) {
|
||||
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to allocate shared memory size %zu alignment %zu\n", __func__,
|
||||
size, alignment);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_riscv64_spacemit_free_shared(void * ptr) {
|
||||
ggml::cpu::riscv64_spacemit::spine_mem_pool_shared_mem_free(ptr);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
namespace ggml::cpu::riscv64_spacemit {
|
||||
|
||||
enum class spine_mem_pool_backend : uint8_t {
|
||||
none,
|
||||
posix_memalign,
|
||||
transparent_hugepage,
|
||||
hugetlb_1g,
|
||||
};
|
||||
|
||||
struct spine_mem_pool_tcm_info {
|
||||
bool available{ false };
|
||||
size_t blk_size{ 0 };
|
||||
size_t blk_num{ 0 };
|
||||
bool is_fake_tcm{ false };
|
||||
};
|
||||
|
||||
bool spine_mem_pool_tcm_init(spine_mem_pool_tcm_info * info) noexcept;
|
||||
void * spine_mem_pool_tcm_mem_get(int cpu_id) noexcept;
|
||||
void * spine_mem_pool_tcm_mem_wait(int cpu_id) noexcept;
|
||||
int spine_mem_pool_tcm_mem_release(int cpu_id) noexcept;
|
||||
|
||||
void * spine_mem_pool_alloc(size_t size, size_t alignment) noexcept;
|
||||
void * spine_mem_pool_shared_mem_alloc(size_t size, size_t alignment) noexcept;
|
||||
void spine_mem_pool_free(void * base) noexcept;
|
||||
void spine_mem_pool_shared_mem_free(void * base) noexcept;
|
||||
|
||||
} // namespace ggml::cpu::riscv64_spacemit
|
||||
@@ -0,0 +1,409 @@
|
||||
#ifndef SPINE_TCM_PUBLIC_H_
|
||||
#define SPINE_TCM_PUBLIC_H_
|
||||
|
||||
/*
|
||||
* spine_tcm public API
|
||||
*
|
||||
* Usage:
|
||||
* 1. Direct link mode
|
||||
* Define SPINE_TCM_DIRECT_LINK and link against libspine_tcm.so.
|
||||
*
|
||||
* if (spine_tcm_is_available()) {
|
||||
* void *buffer = spine_tcm_mem_get(0);
|
||||
* spine_tcm_mem_free(0);
|
||||
* }
|
||||
*
|
||||
* 2. Header-only loader mode
|
||||
* Include this header without linking libspine_tcm.so. The loader first
|
||||
* tries to reuse a process-global spine_tcm instance and falls back to
|
||||
* dlopen("libspine_tcm.so") when needed.
|
||||
*
|
||||
* spine_tcm_open_handle(NULL); // optional pre-bind
|
||||
* if (spine_tcm_is_available()) {
|
||||
* void *buffer = spine_tcm_mem_get(0);
|
||||
* spine_tcm_mem_free(0);
|
||||
* }
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#if !defined(SPINE_TCM_BUILD_SHARED) && !defined(SPINE_TCM_DIRECT_LINK)
|
||||
# include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
# if defined(SPINE_TCM_BUILD_SHARED)
|
||||
# define SPINE_TCM_API __declspec(dllexport)
|
||||
# else
|
||||
# define SPINE_TCM_API __declspec(dllimport)
|
||||
# endif
|
||||
#else
|
||||
# define SPINE_TCM_API __attribute__((visibility("default")))
|
||||
#endif
|
||||
|
||||
typedef struct spine_tcm_mem_info {
|
||||
size_t blk_size;
|
||||
size_t blk_num;
|
||||
int is_fake_tcm;
|
||||
} spine_tcm_mem_info_t;
|
||||
|
||||
typedef struct spine_tcm_block_info {
|
||||
int id;
|
||||
void * va;
|
||||
size_t size;
|
||||
uint64_t phys_addr;
|
||||
uint64_t cpu_affinity_mask;
|
||||
int owner_tid;
|
||||
int is_acquired;
|
||||
} spine_tcm_block_info_t;
|
||||
|
||||
/* Shared-library runtime ABI exported by libspine_tcm.so. */
|
||||
SPINE_TCM_API const char * spine_tcm_runtime_version(void);
|
||||
SPINE_TCM_API int spine_tcm_runtime_is_available(void);
|
||||
SPINE_TCM_API int spine_tcm_runtime_layout_info(spine_tcm_mem_info_t * info);
|
||||
SPINE_TCM_API int spine_tcm_runtime_mem_info(int id, spine_tcm_block_info_t * info);
|
||||
SPINE_TCM_API void * spine_tcm_runtime_mem_get(int id);
|
||||
SPINE_TCM_API int spine_tcm_runtime_mem_free(int id);
|
||||
SPINE_TCM_API void * spine_tcm_runtime_mem_try_wait(int id, size_t timeout_us);
|
||||
SPINE_TCM_API int spine_tcm_runtime_mem_release(int id);
|
||||
SPINE_TCM_API int spine_tcm_runtime_mem_force_release(int id);
|
||||
SPINE_TCM_API int spine_tcm_runtime_mem_query(int id);
|
||||
|
||||
#if defined(SPINE_TCM_DIRECT_LINK)
|
||||
/* Optional no-op in direct-link mode. */
|
||||
static inline int spine_tcm_open_handle(const char * so_path) {
|
||||
(void) so_path;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline const char * spine_tcm_version(void) {
|
||||
return spine_tcm_runtime_version();
|
||||
}
|
||||
|
||||
/* Returns 1 when the runtime driver is available, otherwise 0. */
|
||||
static inline int spine_tcm_is_available(void) {
|
||||
return spine_tcm_runtime_is_available();
|
||||
}
|
||||
|
||||
/* Returns runtime memory geometry and whether the current backend is fake TCM. */
|
||||
static inline int spine_tcm_mem_info(spine_tcm_mem_info_t * info) {
|
||||
return spine_tcm_runtime_layout_info(info);
|
||||
}
|
||||
|
||||
/* Returns per-block runtime metadata for the given TCM id. */
|
||||
static inline int spine_tcm_block_info(int id, spine_tcm_block_info_t * info) {
|
||||
return spine_tcm_runtime_mem_info(id, info);
|
||||
}
|
||||
|
||||
/* Returns a cached buffer for the given TCM id, or NULL on failure. */
|
||||
static inline void * spine_tcm_mem_get(int id) {
|
||||
return spine_tcm_runtime_mem_get(id);
|
||||
}
|
||||
|
||||
/* Releases one reference acquired by spine_tcm_mem_get(id). */
|
||||
static inline int spine_tcm_mem_free(int id) {
|
||||
return spine_tcm_runtime_mem_free(id);
|
||||
}
|
||||
|
||||
/* Waits for a TCM block handoff and returns the driver-owned buffer when available. */
|
||||
static inline void * spine_tcm_mem_try_wait(int id, size_t over_time) {
|
||||
return spine_tcm_runtime_mem_try_wait(id, over_time);
|
||||
}
|
||||
|
||||
/* Releases a buffer acquired by spine_tcm_mem_try_wait(id, over_time). */
|
||||
static inline int spine_tcm_mem_release(int id) {
|
||||
return spine_tcm_runtime_mem_release(id);
|
||||
}
|
||||
|
||||
/* Forces a release for the given TCM id when the backend supports it. */
|
||||
static inline int spine_tcm_mem_force_release(int id) {
|
||||
return spine_tcm_runtime_mem_force_release(id);
|
||||
}
|
||||
|
||||
/* Returns whether the given TCM id is currently acquired. */
|
||||
static inline int spine_tcm_mem_query(int id) {
|
||||
return spine_tcm_runtime_mem_query(id);
|
||||
}
|
||||
#elif !defined(SPINE_TCM_BUILD_SHARED)
|
||||
typedef struct spine_tcm_handle {
|
||||
void * module_handle;
|
||||
int use_global_scope;
|
||||
int owns_module_handle;
|
||||
const char * (*runtime_version)(void);
|
||||
int (*runtime_is_available)(void);
|
||||
int (*runtime_layout_info)(spine_tcm_mem_info_t * info);
|
||||
int (*runtime_mem_info)(int id, spine_tcm_block_info_t * info);
|
||||
void * (*runtime_mem_get)(int id);
|
||||
int (*runtime_mem_free)(int id);
|
||||
void * (*runtime_mem_try_wait)(int id, size_t over_time);
|
||||
int (*runtime_mem_release)(int id);
|
||||
int (*runtime_mem_force_release)(int id);
|
||||
int (*runtime_mem_query)(int id);
|
||||
} spine_tcm_handle_t;
|
||||
|
||||
static inline spine_tcm_handle_t * spine_tcm_default_handle(void) {
|
||||
static spine_tcm_handle_t handle = { 0 };
|
||||
return &handle;
|
||||
}
|
||||
|
||||
static inline void spine_tcm_handle_reset(spine_tcm_handle_t * handle) {
|
||||
if (handle != NULL) {
|
||||
memset(handle, 0, sizeof(*handle));
|
||||
}
|
||||
}
|
||||
|
||||
static inline int spine_tcm_handle_bind(spine_tcm_handle_t * handle) {
|
||||
void * symbol_scope = handle->use_global_scope ? RTLD_DEFAULT : handle->module_handle;
|
||||
|
||||
handle->runtime_version = (const char * (*) (void) ) dlsym(symbol_scope, "spine_tcm_runtime_version");
|
||||
handle->runtime_is_available = (int (*)(void)) dlsym(symbol_scope, "spine_tcm_runtime_is_available");
|
||||
handle->runtime_layout_info =
|
||||
(int (*)(spine_tcm_mem_info_t *)) dlsym(symbol_scope, "spine_tcm_runtime_layout_info");
|
||||
handle->runtime_mem_info =
|
||||
(int (*)(int, spine_tcm_block_info_t *)) dlsym(symbol_scope, "spine_tcm_runtime_mem_info");
|
||||
handle->runtime_mem_get = (void * (*) (int) ) dlsym(symbol_scope, "spine_tcm_runtime_mem_get");
|
||||
handle->runtime_mem_free = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_free");
|
||||
handle->runtime_mem_try_wait = (void * (*) (int, size_t)) dlsym(symbol_scope, "spine_tcm_runtime_mem_try_wait");
|
||||
handle->runtime_mem_release = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_release");
|
||||
handle->runtime_mem_force_release = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_force_release");
|
||||
handle->runtime_mem_query = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_query");
|
||||
|
||||
return handle->runtime_version != NULL && handle->runtime_is_available != NULL &&
|
||||
handle->runtime_layout_info != NULL && handle->runtime_mem_info != NULL &&
|
||||
handle->runtime_mem_get != NULL && handle->runtime_mem_free != NULL &&
|
||||
handle->runtime_mem_try_wait != NULL && handle->runtime_mem_release != NULL &&
|
||||
handle->runtime_mem_force_release != NULL && handle->runtime_mem_query != NULL ?
|
||||
0 :
|
||||
-1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to bind against an already-loaded process-global spine_tcm instance.
|
||||
* The shared library exports spine_tcm_runtime_marker only for this probe.
|
||||
*/
|
||||
static inline int spine_tcm_try_bind_global(spine_tcm_handle_t * handle) {
|
||||
if (dlsym(RTLD_DEFAULT, "spine_tcm_runtime_marker") == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
handle->use_global_scope = 1;
|
||||
return spine_tcm_handle_bind(handle);
|
||||
}
|
||||
|
||||
/*
|
||||
* Optional pre-bind entry point.
|
||||
*
|
||||
* Behavior:
|
||||
* - Reuses an already-loaded global spine_tcm instance when available.
|
||||
* - Otherwise loads the shared library from so_path or the default soname.
|
||||
* - Repeated calls are safe and return 0 after the first successful bind.
|
||||
*/
|
||||
static inline int spine_tcm_open_handle(const char * so_path) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
const char * library = (so_path != NULL && so_path[0] != '\0') ? so_path : "libspine_tcm.so";
|
||||
|
||||
if (resolved->module_handle != NULL || resolved->use_global_scope) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (spine_tcm_try_bind_global(resolved) == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
spine_tcm_handle_reset(resolved);
|
||||
|
||||
resolved->module_handle = dlopen(library, RTLD_LAZY | RTLD_GLOBAL);
|
||||
resolved->owns_module_handle = resolved->module_handle != NULL ? 1 : 0;
|
||||
|
||||
if (resolved->module_handle == NULL) {
|
||||
spine_tcm_handle_reset(resolved);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (spine_tcm_handle_bind(resolved) != 0) {
|
||||
if (resolved->owns_module_handle) {
|
||||
dlclose(resolved->module_handle);
|
||||
}
|
||||
spine_tcm_handle_reset(resolved);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Returns 1 when the runtime driver is available, otherwise 0. */
|
||||
static inline int spine_tcm_is_available(void) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_is_available == NULL) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return resolved->runtime_is_available();
|
||||
}
|
||||
|
||||
/* Returns runtime memory geometry and whether the current backend is fake TCM. */
|
||||
static inline int spine_tcm_mem_info(spine_tcm_mem_info_t * info) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_layout_info == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return resolved->runtime_layout_info(info);
|
||||
}
|
||||
|
||||
static inline const char * spine_tcm_version(void) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_version == NULL) {
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
return resolved->runtime_version();
|
||||
}
|
||||
|
||||
/* Returns per-block runtime metadata for the given TCM id. */
|
||||
static inline int spine_tcm_block_info(int id, spine_tcm_block_info_t * info) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_info == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_info(id, info);
|
||||
}
|
||||
|
||||
/* Returns a cached buffer for the given TCM id, or NULL on failure. */
|
||||
static inline void * spine_tcm_mem_get(int id) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (resolved->runtime_mem_get == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_get(id);
|
||||
}
|
||||
|
||||
/* Releases one reference acquired by spine_tcm_mem_get(id). */
|
||||
static inline int spine_tcm_mem_free(int id) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_free == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_free(id);
|
||||
}
|
||||
|
||||
/* Waits for a TCM block handoff and returns the driver-owned buffer when available. */
|
||||
static inline void * spine_tcm_mem_try_wait(int id, size_t over_time) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (resolved->runtime_mem_try_wait == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_try_wait(id, over_time);
|
||||
}
|
||||
|
||||
/* Releases a buffer acquired by spine_tcm_mem_try_wait(id, over_time). */
|
||||
static inline int spine_tcm_mem_release(int id) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_release == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_release(id);
|
||||
}
|
||||
|
||||
/* Forces a release for the given TCM id when the backend supports it. */
|
||||
static inline int spine_tcm_mem_force_release(int id) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) ||
|
||||
resolved->runtime_mem_force_release == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_force_release(id);
|
||||
}
|
||||
|
||||
/* Returns whether the given TCM id is currently acquired. */
|
||||
static inline int spine_tcm_mem_query(int id) {
|
||||
spine_tcm_handle_t * resolved = spine_tcm_default_handle();
|
||||
|
||||
if (resolved->module_handle == NULL && !resolved->use_global_scope) {
|
||||
(void) spine_tcm_open_handle(NULL);
|
||||
}
|
||||
|
||||
if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_query == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return resolved->runtime_mem_query(id);
|
||||
}
|
||||
#else
|
||||
static inline const char * spine_tcm_version(void) {
|
||||
return spine_tcm_runtime_version();
|
||||
}
|
||||
#endif
|
||||
|
||||
#define SPINE_TCM_VERSION (spine_tcm_version())
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user