ggml-cpu: Add IME2 Instruction Support for the SpacemiT Backend (#22863)

This commit is contained in:
alex-spacemit
2026-05-14 17:39:30 +08:00
committed by GitHub
parent 0f45f1a35c
commit 81b0d882ae
21 changed files with 14732 additions and 3477 deletions
+3 -2
View File
@@ -301,16 +301,17 @@ jobs:
export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
cmake -B build -DLLAMA_OPENSSL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DGGML_CPU_REPACK=OFF \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DGGML_CPU_RISCV64_SPACEMIT=ON \
-DGGML_RVV=ON \
-DGGML_RV_ZVFH=ON \
-DGGML_RV_ZFH=ON \
-DGGML_RV_ZICBOP=ON \
-DGGML_RV_ZIHINTPAUSE=ON \
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
-DGGML_RV_ZBA=ON \
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
cmake --build build --config Release -j $(nproc)
+2 -2
View File
@@ -24,6 +24,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_CXX_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
+51 -31
View File
@@ -9,18 +9,20 @@ wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_6
~~~
2. Build
Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
Below is the build script. It relies on RISC-V vector instructions for acceleration, so make sure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization versions are `RISCV64_SPACEMIT_IME1` and `RISCV64_SPACEMIT_IME2`, selected through the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
```bash
cmake -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_CPU_RISCV64_SPACEMIT=ON \
-DGGML_CPU_REPACK=OFF \
-DLLAMA_OPENSSL=OFF \
-DGGML_RVV=ON \
-DGGML_RV_ZVFH=ON \
-DGGML_RV_ZFH=ON \
-DGGML_RV_ZICBOP=ON \
-DGGML_RV_ZIHINTPAUSE=ON \
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
-DGGML_RV_ZBA=ON \
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
-DCMAKE_INSTALL_PREFIX=build/installed
@@ -47,8 +49,25 @@ export RISCV_ROOT_PATH_IME1={your RISC-V compiler path}
${QEMU_ROOT_PATH}/bin/qemu-riscv64 -L ${RISCV_ROOT_PATH_IME1}/sysroot -cpu max,vlen=256,elen=64,vext_spec=v1.0 ${PWD}/build/bin/llama-cli -m ${PWD}/models/Qwen2.5-0.5B-Instruct-Q4_0.gguf -t 1
~~~
## Quantization Support For Matrix
| Quantization Type | X60 | A100 |
| ---: | ---: | ---: |
| Q2_K | | :heavy_check_mark: |
| Q3_K | | :heavy_check_mark: |
| Q4_0 | :heavy_check_mark: | :heavy_check_mark: |
| Q4_1 | :heavy_check_mark: | :heavy_check_mark: |
| Q4_K | :heavy_check_mark: | :heavy_check_mark: |
| Q5_0 | | :heavy_check_mark: |
| Q5_1 | | :heavy_check_mark: |
| Q5_K | | :heavy_check_mark: |
| Q6_K | | :heavy_check_mark: |
| Q8_0 | | :heavy_check_mark: |
## Performance
#### Quantization Support For Matrix
* Spacemit(R) X60
~~~
model name : Spacemit(R) X60
isa : rv64imafdcv_zicbom_zicboz_zicntr_zicond_zicsr_zifencei_zihintpause_zihpm_zfh_zfhmin_zca_zcd_zba_zbb_zbc_zbs_zkt_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkt_sscofpmf_sstc_svinval_svnapot_svpbmt
@@ -58,33 +77,34 @@ mvendorid : 0x710
marchid : 0x8000000058000001
~~~
Q4_0
| Model | Size | Params | backend | threads | test | t/s |
| -----------| -------- | ------ | ------- | ------- | ---- |------|
Qwen2.5 0.5B |403.20 MiB|630.17 M| cpu | 4 | pp512|64.12 ± 0.26|
Qwen2.5 0.5B |403.20 MiB|630.17 M| cpu | 4 | tg128|10.03 ± 0.01|
Qwen2.5 1.5B |1011.16 MiB| 1.78 B | cpu | 4 | pp512|24.16 ± 0.02|
Qwen2.5 1.5B |1011.16 MiB| 1.78 B | cpu | 4 | tg128|3.83 ± 0.06|
Qwen2.5 3B | 1.86 GiB | 3.40 B | cpu | 4 | pp512|12.08 ± 0.02|
Qwen2.5 3B | 1.86 GiB | 3.40 B | cpu | 4 | tg128|2.23 ± 0.02|
Q4_1
| Model | Size | Params | backend | threads | test | t/s |
| -----------| -------- | ------ | ------- | ------- | ---- |------|
Qwen2.5 0.5B |351.50 MiB|494.03 M| cpu | 4 | pp512|62.07 ± 0.12|
Qwen2.5 0.5B |351.50 MiB|494.03 M| cpu | 4 | tg128|9.91 ± 0.01|
Qwen2.5 1.5B |964.06 MiB| 1.54 B | cpu | 4 | pp512|22.95 ± 0.25|
Qwen2.5 1.5B |964.06 MiB| 1.54 B | cpu | 4 | tg128|4.01 ± 0.15|
Qwen2.5 3B | 1.85 GiB | 3.09 B | cpu | 4 | pp512|11.55 ± 0.16|
Qwen2.5 3B | 1.85 GiB | 3.09 B | cpu | 4 | tg128|2.25 ± 0.04|
| model | size | params | backend | threads | n_ubatch | fa | mmap | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | ---: | --------------: | -------------------: |
| qwen35 2B Q4_1 | 1.19 GiB | 1.88 B | CPU | 4 | 128 | 1 | 0 | pp128 | 10.32 ± 0.02 |
| qwen35 2B Q4_1 | 1.19 GiB | 1.88 B | CPU | 4 | 128 | 1 | 0 | tg128 | 3.07 ± 0.01 |
| qwen3 0.6B Q4_0 | 358.78 MiB | 596.05 M | CPU | 4 | 128 | 1 | 0 | pp128 | 49.15 ± 0.25 |
| qwen3 0.6B Q4_0 | 358.78 MiB | 596.05 M | CPU | 4 | 128 | 1 | 0 | tg128 | 11.73 ± 0.02 |
Q4_K
| Model | Size | Params | backend | threads | test | t/s |
| -----------| -------- | ------ | ------- | ------- | ---- |------|
Qwen2.5 0.5B |462.96 MiB|630.17 M| cpu | 4 | pp512|9.29 ± 0.05|
Qwen2.5 0.5B |462.96 MiB|630.17 M| cpu | 4 | tg128|5.67 ± 0.04|
Qwen2.5 1.5B | 1.04 GiB | 1.78 B | cpu | 4 | pp512|10.38 ± 0.10|
Qwen2.5 1.5B | 1.04 GiB | 1.78 B | cpu | 4 | tg128|3.17 ± 0.08|
Qwen2.5 3B | 1.95 GiB | 3.40 B | cpu | 4 | pp512|4.23 ± 0.04|
Qwen2.5 3B | 1.95 GiB | 3.40 B | cpu | 4 | tg128|1.73 ± 0.00|
* Spacemit(R) A100
~~~
model name : Spacemit(R) A100
isa : rv64imafdcvh_zicbom_zicbop_zicboz_zicntr_zicond_zicsr_zifencei_zihintntl_zihintpause_zihpm_zimop_zaamo_zalrsc_zawrs_zfa_zfh_zfhmin_zca_zcb_zcd_zcmop_zba_zbb_zbc_zbs_zkt_zvbb_zvbc_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvkt_smaia_smstateen_ssaia_sscofpmf_sstc_svinval_svnapot_svpbmt_sdtrig
mmu : sv39
mvendorid : 0x710
marchid : 0x8000000041000002
mimpid : 0x10000000d5686200
hart isa : rv64imafdcv_zicbom_zicbop_zicboz_zicntr_zicond_zicsr_zifencei_zihintntl_zihintpause_zihpm_zimop_zaamo_zalrsc_zawrs_zfa_zfh_zfhmin_zca_zcb_zcd_zcmop_zba_zbb_zbc_zbs_zkt_zvbb_zvbc_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvkt_smaia_smstateen_ssaia_sscofpmf_sstc_svinval_svnapot_svpbmt_sdtrig
~~~
| model | size | params | backend | threads | n_ubatch | fa | mmap | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | ---: | --------------: | -------------------: |
| qwen3 0.6B Q4_0 | 358.78 MiB | 596.05 M | CPU | 8 | 128 | 1 | 0 | pp128 | 565.83 ± 0.31 |
| qwen3 0.6B Q4_0 | 358.78 MiB | 596.05 M | CPU | 8 | 128 | 1 | 0 | tg128 | 55.77 ± 0.02 |
| qwen3 4B Q4_0 | 2.21 GiB | 4.02 B | CPU | 8 | 128 | 1 | 0 | pp128 | 79.74 ± 0.04 |
| qwen3 4B Q4_0 | 2.21 GiB | 4.02 B | CPU | 8 | 128 | 1 | 0 | tg128 | 11.29 ± 0.00 |
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CPU | 8 | 128 | 1 | 0 | pp128 | 57.88 ± 0.31 |
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CPU | 8 | 128 | 1 | 0 | tg128 | 12.79 ± 0.00 |
| qwen35 2B Q4_1 | 1.19 GiB | 1.88 B | CPU | 8 | 128 | 1 | 0 | pp128 | 115.23 ± 0.04 |
| qwen35 2B Q4_1 | 1.19 GiB | 1.88 B | CPU | 8 | 128 | 1 | 0 | tg128 | 16.49 ± 0.01 |
| gemma4 E4B Q4_K - Medium | 4.76 GiB | 7.52 B | CPU | 8 | 128 | 1 | 0 | pp128 | 21.13 ± 0.01 |
| gemma4 E4B Q4_K - Medium | 4.76 GiB | 7.52 B | CPU | 8 | 128 | 1 | 0 | tg128 | 5.66 ± 0.00 |
+13
View File
@@ -450,12 +450,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
ggml-cpu/arch/riscv/repack.cpp
)
if (GGML_CPU_RISCV64_SPACEMIT)
include(ggml-cpu/cmake/FindSMTIME.cmake)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
list(APPEND GGML_CPU_SOURCES
ggml-cpu/spacemit/ime.cpp
ggml-cpu/spacemit/ime.h
ggml-cpu/spacemit/spine_mem_pool.cpp
ggml-cpu/spacemit/spine_mem_pool.h
ggml-cpu/spacemit/repack.cpp
ggml-cpu/spacemit/repack.h
ggml-cpu/spacemit/ime_env.cpp
ggml-cpu/spacemit/ime_env.h
ggml-cpu/spacemit/ime1_kernels.cpp
ggml-cpu/spacemit/ime2_kernels.cpp
ggml-cpu/spacemit/ime_kernels.h
ggml-cpu/spacemit/rvv_kernels.cpp
ggml-cpu/spacemit/rvv_kernels.h
)
endif()
if(NOT GGML_CPU_ALL_VARIANTS)
@@ -485,6 +495,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (GGML_RV_ZIHINTPAUSE)
string(APPEND MARCH_STR "_zihintpause")
endif()
if (GGML_RV_ZBA)
string(APPEND MARCH_STR "_zba")
endif()
if (GGML_CPU_RISCV64_SPACEMIT)
# `xsmtvdotii' is only required for GCC >= 15.
if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
+32
View File
@@ -0,0 +1,32 @@
# Probe the active C compiler/assembler for SpacemiT IME instruction support and
# derive RISCV64_SPACEMIT_IME_SPEC from the result.
#
# Every probe only compiles and links a one-instruction inline-asm program;
# nothing is executed, so the checks also work when cross-compiling.
include(CheckCSourceRuns)
# check_c_source_compiles() lives in CheckCSourceCompiles; include it explicitly
# instead of relying on some other module having loaded it first.
include(CheckCSourceCompiles)
include(CMakePushCheckState)

if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)" AND GGML_CPU_RISCV64_SPACEMIT)
    set(SMT_MARCH_STR "-march=rv64gcv_zfh_zvfh_zba_zicbop")
    # GCC >= 15 additionally needs the vendor extension in -march.
    if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
        CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 15)
        string(APPEND SMT_MARCH_STR "_xsmtvdotii")
    endif()

    # Save the caller's CMAKE_REQUIRED_* state and restore it afterwards rather
    # than unsetting CMAKE_REQUIRED_FLAGS, which would drop a value set earlier.
    cmake_push_check_state()
    set(CMAKE_REQUIRED_FLAGS "${SMT_MARCH_STR}")

    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_IME1)
    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1, i4\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S4)
    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1, i8\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S8)
    check_c_source_compiles("int main() {__asm__ volatile(\"vfwmadot v2, v0, v1, fp16\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFWMADOT_FP16)
    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot.hp v2, v0, v1, v0, 0, i4\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFMADOT_S4)
    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot.hp v2, v0, v1, v0, 0, i8\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFMADOT_S8)
    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot1 v2, v0, v1\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOTN)
    check_c_source_compiles("int main() {__asm__ volatile(\"vpack.vv v2, v0, v1, 2\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VPACK)
    check_c_source_compiles("int main() {__asm__ volatile(\"vnspack.vv v2, v0, v1, 2\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VNPACK)

    cmake_pop_check_state()

    # NOTE(review): when the IME1 probe succeeds the variable is overwritten, so a
    # value passed on the command line is replaced by the detected set - confirm
    # that this is intended.
    list(APPEND RISCV64_SPACEMIT_IME_SPEC "")
    if (SPACEMIT_RISCV_COMPILER_SUPPORT_IME1)
        set(RISCV64_SPACEMIT_IME_SPEC "RISCV64_SPACEMIT_IME1")
    endif()
    # IME2 is enabled only when the i4 vmadot form and both pack instructions assemble.
    if (SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S4 AND SPACEMIT_RISCV_COMPILER_SUPPORT_VPACK AND SPACEMIT_RISCV_COMPILER_SUPPORT_VNPACK)
        list(APPEND RISCV64_SPACEMIT_IME_SPEC "RISCV64_SPACEMIT_IME2")
    endif()

    message(STATUS "RISCV64_SPACEMIT_IME_SPEC: ${RISCV64_SPACEMIT_IME_SPEC}")
endif()
+12
View File
@@ -50,6 +50,10 @@
#include "llamafile/sgemm.h"
#endif
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
# include "spacemit/ime.h"
#endif
// Note: once we move threading into a separate C++ file
// will use std::hardware_destructive_interference_size instead of hardcoding it here
// and we'll use C++ attribute syntax.
@@ -3011,7 +3015,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
const struct ggml_cgraph * cgraph = tp->cgraph;
const struct ggml_cplan * cplan = tp->cplan;
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
ggml_backend_cpu_riscv64_spacemit_set_numa_thread_affinity(state->ith);
#else
set_numa_thread_affinity(state->ith);
#endif
struct ggml_compute_params params = {
/*.ith =*/ state->ith,
@@ -3068,6 +3076,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_barrier(state->threadpool);
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
ggml_backend_cpu_riscv64_spacemit_clear_numa_thread_affinity_threaded(state->ith);
#endif
return 0;
}
File diff suppressed because it is too large Load Diff
+8
View File
@@ -8,6 +8,14 @@ extern "C" {
ggml_backend_buffer_type_t ggml_backend_cpu_riscv64_spacemit_buffer_type(void);
void ggml_backend_cpu_riscv64_spacemit_set_numa_thread_affinity(int thread_n);
void ggml_backend_cpu_riscv64_spacemit_clear_numa_thread_affinity_threaded(int thread_n);
void * ggml_backend_cpu_riscv64_spacemit_alloc_shared(size_t size, size_t alignment);
void ggml_backend_cpu_riscv64_spacemit_free_shared(void * ptr);
#ifdef __cplusplus
}
#endif
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+320
View File
@@ -0,0 +1,320 @@
#include "ime_env.h"

#include "ggml-impl.h"
#include "spine_mem_pool.h"

#include <fcntl.h>
#include <unistd.h>

#include <algorithm>
#include <array>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <limits>
#include <stdexcept>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>
namespace ggml::cpu::riscv64_spacemit {
bool spine_core_info::get_spine_core_info(std::vector<spine_core_info> & result) {
static std::unordered_map<uint64_t, spine_core_arch_id> spine_march_mapping_ = {
{0x8000000058000001, spine_core_arch_id::core_arch_x60 },
{ 0x8000000041000001, spine_core_arch_id::core_arch_a60 },
{ 0x8000000058000002, spine_core_arch_id::core_arch_x100},
{ 0x8000000041000002, spine_core_arch_id::core_arch_a100},
};
result.clear();
std::ifstream file("/proc/cpuinfo");
std::string line;
std::vector<std::array<uint64_t, 2>> cpu_info_list;
uint64_t current_processor = spine_invalid_core_id;
uint64_t current_marchid = 0;
bool has_processor = false;
bool has_marchid = false;
if (!file.is_open()) {
return false;
}
while (std::getline(file, line)) {
if (line.substr(0, 9) == "processor") {
if (has_processor && has_marchid) {
cpu_info_list.push_back({ current_processor, current_marchid });
}
size_t colon_pos = line.find(':');
if (colon_pos != std::string::npos) {
current_processor = std::stoi(line.substr(colon_pos + 1));
has_processor = true;
}
has_marchid = false;
} else if (line.substr(0, 7) == "marchid") {
size_t colon_pos = line.find(':');
if (colon_pos != std::string::npos) {
std::string marchid_str = line.substr(colon_pos + 1);
marchid_str.erase(std::remove_if(marchid_str.begin(), marchid_str.end(), isspace), marchid_str.end());
current_marchid = std::stoull(marchid_str, nullptr, 16);
has_marchid = true;
}
}
}
if (has_processor && has_marchid) {
cpu_info_list.push_back({ current_processor, current_marchid });
}
if (has_processor && has_marchid) {
for (auto & cpu_info : cpu_info_list) {
if (cpu_info[0] != spine_invalid_core_id &&
spine_march_mapping_.find(cpu_info[1]) != spine_march_mapping_.end()) {
auto core_info = spine_core_info();
core_info.core_id = cpu_info[0];
core_info.arch_id = spine_core_arch_id(spine_march_mapping_[cpu_info[1]]);
result.push_back(core_info);
}
}
}
return has_processor && has_marchid;
}
namespace {
// Parses a hexadecimal string, with or without a leading "0x"/"0X", into a
// uint16_t.
//
// Throws std::invalid_argument("Invalid hexadecimal string") when no digits can
// be parsed, and std::out_of_range when the value does not fit in 16 bits (or
// does not fit in unsigned long, in which case std::stoul's exception is passed on).
uint16_t hex_string_to_u16(const std::string & hex_str) {
    const bool has_prefix = hex_str.compare(0, 2, "0x") == 0 || hex_str.compare(0, 2, "0X") == 0;
    const std::string digits = has_prefix ? hex_str.substr(2) : hex_str;

    unsigned long parsed = 0;
    try {
        parsed = std::stoul(digits, nullptr, 16);
    } catch (const std::invalid_argument &) {
        throw std::invalid_argument("Invalid hexadecimal string");
    }

    if (parsed > std::numeric_limits<uint16_t>::max()) {
        throw std::out_of_range("Converted value is out of range for uint16_t");
    }
    return static_cast<uint16_t>(parsed);
}
// Maps a memory-pool backend to the label printed in the debug log. The labels
// are the same spellings parse_mem_backend names in its error message; any
// value not handled by the switch yields "unknown".
const char * spine_mem_pool_backend_to_string(spine_mem_pool_backend backend) {
    const char * label = "unknown";
    switch (backend) {
        case spine_mem_pool_backend::none:
            label = "NONE";
            break;
        case spine_mem_pool_backend::posix_memalign:
            label = "POSIX";
            break;
        case spine_mem_pool_backend::transparent_hugepage:
            label = "HPAGE";
            break;
        case spine_mem_pool_backend::hugetlb_1g:
            label = "HPAGE1GB";
            break;
    }
    return label;
}
// Translates the SPACEMIT_MEM_BACKEND setting into a backend enum.
//
// A null or empty string selects transparent_hugepage. Otherwise the value is
// matched case-insensitively against "none", "posix", "hpage" and "hpage1gb";
// anything else throws std::runtime_error quoting the lower-cased value.
spine_mem_pool_backend parse_mem_backend(const char * mem_backend_str) {
    const bool is_unset = mem_backend_str == nullptr || *mem_backend_str == '\0';
    if (is_unset) {
        return spine_mem_pool_backend::transparent_hugepage;
    }

    // Lower-case copy of the input so the comparison ignores case.
    std::string value;
    for (const char * p = mem_backend_str; *p != '\0'; ++p) {
        value.push_back(static_cast<char>(std::tolower(static_cast<unsigned char>(*p))));
    }

    struct backend_name {
        const char *           name;
        spine_mem_pool_backend backend;
    };

    static const backend_name known_backends[] = {
        { "none",     spine_mem_pool_backend::none                 },
        { "posix",    spine_mem_pool_backend::posix_memalign       },
        { "hpage",    spine_mem_pool_backend::transparent_hugepage },
        { "hpage1gb", spine_mem_pool_backend::hugetlb_1g           },
    };

    for (const auto & entry : known_backends) {
        if (value == entry.name) {
            return entry.backend;
        }
    }

    throw std::runtime_error("invalid SPACEMIT_MEM_BACKEND: " + value + ", expected NONE, POSIX, HPAGE or HPAGE1GB");
}
} // namespace
// Builds the SpacemiT runtime configuration.
//
// Steps, in order:
//   1. detect the cores (from /proc/cpuinfo, or from SPACEMIT_CORE_ARCH when
//      nothing could be detected),
//   2. choose the preferred core architecture and core ids (optionally forced
//      with SPACEMIT_PERFER_CORE_ARCH / SPACEMIT_PERFER_CORE_ID),
//   3. derive use_ime1 / use_ime2 from the preferred architecture,
//   4. read SPACEMIT_MEM_BACKEND and, unless SPACEMIT_DISABLE_TCM is set to
//      something other than "0", probe TCM,
//   5. allocate and initialise the init barriers.
//
// Throws std::runtime_error when no core can be identified; the hex / integer
// parsers used for the environment variables may throw as well. Inconsistent
// SPACEMIT_PERFER_* settings abort via GGML_ABORT.
spine_env_info::spine_env_info() {
    num_cores = static_cast<int>(std::thread::hardware_concurrency());
    spine_core_info::get_spine_core_info(core_info_list);

    // special for x60 K1: when eight cores are reported and the first one is an
    // X60, the first four entries are relabelled as A60
    if (core_info_list.size() == 8 && core_info_list[0].arch_id == spine_core_arch_id::core_arch_x60) {
        for (int i = 0; i < 4; i++) {
            core_info_list[i].arch_id = spine_core_arch_id::core_arch_a60;
        }
    }

    // special for qemu: nothing detected, so take the architecture from the
    // environment and assume every core has it
    if (core_info_list.size() == 0) {
        char * spine_core_arch_str = getenv("SPACEMIT_CORE_ARCH");
        if (spine_core_arch_str != nullptr) {
            auto arch_id = hex_string_to_u16(spine_core_arch_str);
            for (int i = 0; i < num_cores; i++) {
                auto core_info    = spine_core_info();
                core_info.core_id = i;
                core_info.arch_id = spine_core_arch_id{ arch_id };
                core_info_list.push_back(core_info);
            }
        }
    }

    if (core_info_list.size() == 0) {
        throw std::runtime_error(
            "Failed to get SPACEMIT_CORE_ARCH from environment or failed to parse it from /proc/cpuinfo");
    }

    // An empty variable counts as "not set". The string has to be inspected
    // itself: comparing the pointer against "" never detects an empty value.
    char * spine_perfer_core_arch_str = getenv("SPACEMIT_PERFER_CORE_ARCH");
    if (spine_perfer_core_arch_str != nullptr && spine_perfer_core_arch_str[0] != '\0') {
        perfer_core_arch_id = spine_core_arch_id{ hex_string_to_u16(spine_perfer_core_arch_str) };
    }

    // SPACEMIT_PERFER_CORE_ID is a comma-separated list of core ids.
    char *           spine_perfer_core_id_str = getenv("SPACEMIT_PERFER_CORE_ID");
    std::vector<int> perfer_core_id_vec;
    if (spine_perfer_core_id_str != nullptr && spine_perfer_core_id_str[0] != '\0') {
        std::string perfer_core_id_str(spine_perfer_core_id_str);
        size_t      start = 0;
        size_t      end   = 0;
        while ((end = perfer_core_id_str.find(',', start)) != std::string::npos) {
            std::string core_id_substr = perfer_core_id_str.substr(start, end - start);
            perfer_core_id_vec.push_back(std::stoi(core_id_substr));
            start = end + 1;
        }
        std::string core_id_substr = perfer_core_id_str.substr(start);
        perfer_core_id_vec.push_back(std::stoi(core_id_substr));
    }

    perfer_core_ids.reserve(num_cores);

    if (perfer_core_arch_id == spine_core_arch_id::core_arch_none) {
        // No architecture requested: prefer every core whose arch id has 0xA in
        // its top nibble.
        for (auto & core_info : core_info_list) {
            auto core_arch_id   = core_info.arch_id;
            auto core_arch_head = (uint16_t) (core_arch_id) >> 12;
            if (core_arch_head == 0xA) {
                num_perfer_cores++;
                perfer_core_arch_id = core_arch_id;
                cpu_mask |= (1ULL << core_info.core_id);
                perfer_core_ids.push_back(core_info.core_id);
            }
        }
    } else {
        // Architecture requested: count and mask every matching core, but only
        // 0xA-headed cores are added to perfer_core_ids.
        for (auto & core_info : core_info_list) {
            auto core_arch_id = core_info.arch_id;
            if (core_arch_id == perfer_core_arch_id) {
                num_perfer_cores++;
                cpu_mask |= (1ULL << core_info.core_id);
                auto core_arch_head = (uint16_t) (core_arch_id) >> 12;
                if (core_arch_head == 0xA) {
                    perfer_core_ids.push_back(core_info.core_id);
                }
            }
        }

        if (num_perfer_cores == 0) {
            GGML_ABORT("can not find core with arch id %x for SPACEMIT_PERFER_CORE_ARCH in core info list\n",
                       (uint16_t) perfer_core_arch_id);
        }
    }

    // An explicit id list replaces the selection made above.
    if (perfer_core_id_vec.size() > 0) {
        perfer_core_ids.clear();
        cpu_mask         = 0;
        num_perfer_cores = 0;

        for (int core_id : perfer_core_id_vec) {
            if (core_id < 0 || core_id >= num_cores) {
                GGML_ABORT("invalid core id in SPACEMIT_PERFER_CORE_ID: %d, should be between 0 and %d\n", core_id,
                           num_cores - 1);
            }

            // Look the core up by its id. core_info_list only contains recognised
            // cores, so it can be shorter than num_cores and an entry's position
            // need not equal its core id; indexing with core_id could read out of
            // bounds or pick the wrong core.
            auto core_it = std::find_if(core_info_list.begin(), core_info_list.end(),
                                        [core_id](const spine_core_info & info) {
                                            return info.core_id == static_cast<uint64_t>(core_id);
                                        });
            if (core_it == core_info_list.end()) {
                GGML_ABORT("core id %d in SPACEMIT_PERFER_CORE_ID is not present in core info list\n", core_id);
            }

            auto core_arch_id = core_it->arch_id;
            if (core_arch_id == perfer_core_arch_id) {
                cpu_mask |= (1ULL << core_id);
                perfer_core_ids.push_back(core_id);
            } else {
                GGML_ABORT(
                    "core id %d in SPACEMIT_PERFER_CORE_ID has arch id %x which does not match "
                    "SPACEMIT_PERFER_CORE_ARCH %x\n",
                    core_id, (uint16_t) core_arch_id, (uint16_t) perfer_core_arch_id);
            }
        }

        std::string perfer_core_id_vec_str;
        for (int core_id : perfer_core_id_vec) {
            perfer_core_id_vec_str += std::to_string(core_id) + ",";
        }
        perfer_core_id_vec_str.pop_back();  // drop the trailing comma; the vector is non-empty here

        GGML_LOG_DEBUG("SPACEMIT_PERFER_CORE_ID is set, perferred core ids: %s\n", perfer_core_id_vec_str.c_str());

        num_perfer_cores = static_cast<int>(perfer_core_id_vec.size());
    }

    use_ime1 = perfer_core_arch_id == spine_core_arch_id::core_arch_a60 ||
               perfer_core_arch_id == spine_core_arch_id::core_arch_x100;
    use_ime2 = perfer_core_arch_id == spine_core_arch_id::core_arch_a100;

    mem_backend = parse_mem_backend(getenv("SPACEMIT_MEM_BACKEND"));

    // TCM is probed unless SPACEMIT_DISABLE_TCM is set to anything but "0".
    char * spine_disable_tcm_str = getenv("SPACEMIT_DISABLE_TCM");
    auto   user_disable_tcm      = spine_disable_tcm_str != nullptr && strcmp(spine_disable_tcm_str, "0") != 0;

    if (!user_disable_tcm) {
        spine_mem_pool_tcm_info tcm_info;
        if (spine_mem_pool_tcm_init(&tcm_info)) {
            use_tcm      = tcm_info.available;
            tcm_blk_size = tcm_info.blk_size;
            GGML_LOG_DEBUG("CPU_RISCV64_SPACEMIT: tcm is available, blk_size: %zu, blk_num: %zu, is_fake_tcm: %d\n",
                           tcm_info.blk_size, tcm_info.blk_num, tcm_info.is_fake_tcm);

            // Count the leading cores that are not 0xA-headed, stopping at the first one that is.
            for (auto & core_info : core_info_list) {
                auto core_arch_head = (uint16_t) (core_info.arch_id) >> 12;
                if (core_arch_head != 0xA) {
                    aicpu_id_offset++;
                } else {
                    break;
                }
            }
        }
    }

    GGML_LOG_DEBUG(
        "CPU_RISCV64_SPACEMIT: num_cores: %d, num_perfer_cores: %d, perfer_core_arch_id: %x, exclude_main_thread: %d, "
        "use_ime1: %d, use_ime2: %d, mem_backend: %s, cpu_mask: %lx, aicpu_id_offset: %d\n",
        num_cores, num_perfer_cores, (uint16_t) perfer_core_arch_id, exclude_main_thread, use_ime1, use_ime2,
        spine_mem_pool_backend_to_string(mem_backend), cpu_mask, aicpu_id_offset);

    // Prefer shared memory for the init barriers; fall back to the heap. The
    // destructor uses init_barrier_is_shared_mem to choose the matching release call.
    const size_t init_barrier_size = sizeof(spine_barrier_t) * spine_init_barrier_count;
    init_barrier =
        static_cast<spine_barrier_t *>(spine_mem_pool_shared_mem_alloc(init_barrier_size, alignof(spine_barrier_t)));
    if (init_barrier != nullptr) {
        init_barrier_is_shared_mem = true;
    } else {
        // The format string has no conversion specifier, so it takes no extra argument.
        GGML_LOG_WARN("CPU_RISCV64_SPACEMIT: failed to allocate init_barrier from shared mem, falling back to heap\n");
        init_barrier = new spine_barrier_t[spine_init_barrier_count];
    }

    spine_barrier_init(init_barrier, spine_init_barrier_count, 2);
}
// Releases the init barriers with the call that matches how the constructor
// obtained them (shared-memory pool or new[]), then resets both members.
spine_env_info::~spine_env_info() {
    if (!init_barrier_is_shared_mem) {
        delete[] init_barrier;
    } else {
        spine_mem_pool_shared_mem_free(init_barrier);
    }

    init_barrier_is_shared_mem = false;
    init_barrier               = nullptr;
}
spine_env_info global_spine_env_info;
} // namespace ggml::cpu::riscv64_spacemit
+55
View File
@@ -0,0 +1,55 @@
#pragma once
#include "spine_barrier.h"
#include "spine_mem_pool.h"
#include <cstddef>
#include <cstdint>
#include <vector>
namespace ggml::cpu::riscv64_spacemit {
// Sentinel meaning "no core id"; it is the default value of spine_core_info::core_id.
constexpr uint64_t spine_invalid_core_id = 0xFFFFFFFF;
// Number of spine_barrier_t objects in the spine_env_info::init_barrier array.
constexpr size_t spine_init_barrier_count = 16;
// Identifier of a SpacemiT core architecture.
// As the enumerator values show, the top nibble is 0x5 for the x-named entries
// and 0xA for the a-named entries, and the low 12 bits equal the number in the
// name (0x03C = 60, 0x064 = 100, 0x0C8 = 200).
enum class spine_core_arch_id : uint16_t {
// default value, used before an architecture has been assigned
core_arch_none = 0,
core_arch_x60 = 0x503C,
core_arch_x100 = 0x5064,
core_arch_x200 = 0x50C8,
core_arch_a60 = 0xA03C,
core_arch_a100 = 0xA064,
core_arch_a200 = 0xA0C8,
};
// One core: its id and its architecture.
struct spine_core_info {
// core id; spine_invalid_core_id until assigned
uint64_t core_id{ spine_invalid_core_id };
// architecture of the core; core_arch_none until assigned
spine_core_arch_id arch_id{ spine_core_arch_id::core_arch_none };
// Fills `result` with the cores that could be identified and returns whether
// the probe succeeded.
static bool get_spine_core_info(std::vector<spine_core_info> & result);
};
// Runtime configuration of the SpacemiT backend. The constructor fills every
// field; a single instance is exposed as global_spine_env_info.
struct spine_env_info {
// cores that were detected (or synthesised from SPACEMIT_CORE_ARCH)
std::vector<spine_core_info> core_info_list;
// ids of the cores selected as preferred
std::vector<int> perfer_core_ids;
// number of leading entries of core_info_list whose arch id does not have 0xA
// in its top nibble; only computed when TCM initialisation succeeds
int aicpu_id_offset{ 0 };
// value of std::thread::hardware_concurrency()
int num_cores{ 0 };
// number of preferred cores
int num_perfer_cores{ 0 };
// architecture of the preferred cores (SPACEMIT_PERFER_CORE_ARCH or auto-selected)
spine_core_arch_id perfer_core_arch_id{ spine_core_arch_id::core_arch_none };
// NOTE(review): only read for logging in the constructor; its consumers are not visible here
bool exclude_main_thread{ false };
// true when the preferred architecture is core_arch_a100
bool use_ime2{ false };
// true when the preferred architecture is core_arch_a60 or core_arch_x100
bool use_ime1{ false };
// availability flag reported by spine_mem_pool_tcm_init
bool use_tcm{ false };
// backend parsed from SPACEMIT_MEM_BACKEND
spine_mem_pool_backend mem_backend{ spine_mem_pool_backend::transparent_hugepage };
// block size reported by spine_mem_pool_tcm_init
uint64_t tcm_blk_size{ 0 };
// bit i is set when core id i is a preferred core
uint64_t cpu_mask{ 0 };
// array of spine_init_barrier_count barriers
spine_barrier_t * init_barrier{ nullptr };
// true when init_barrier came from the shared-memory pool rather than new[]
bool init_barrier_is_shared_mem{ false };
spine_env_info();
~spine_env_info();
};
extern spine_env_info global_spine_env_info;
} // namespace ggml::cpu::riscv64_spacemit
+180 -17
View File
@@ -1,26 +1,189 @@
#pragma once
#include <cassert>
#include <cstddef>
#include <functional>
namespace spacemit_kernels {
#define BLOCK_QNK_LEN 256
// Layout holding N rows of one BLOCK_QNK_LEN-element block with 2-bit
// quantised values (four values per byte in qs).
template <int N> struct nrow_block_q2_k {
// [4bit scale + 4bit zp] * N * 16
uint8_t scales[N * BLOCK_QNK_LEN / 16];
// [b0, b16, b32, b48] [b1, b17, b33, b49] ... [b15, b31, b47, b63]
// [b64, b80, b96, b112] ...[b79, b95, b111, b127]
// [b128, b144, b160, b176] ...[b143, b159, b175, b191]
// [b192, b208, b224, b240] ...[b207, b223, b239, b255]
uint8_t qs[N * BLOCK_QNK_LEN / 4];
// one 16-bit value per row - presumably the fp16 block scale; TODO confirm
uint16_t scales16[N];
// one 16-bit value per row - presumably the fp16 block zero/min; TODO confirm
uint16_t zeros16[N];
};
// Layout holding N rows of one BLOCK_QNK_LEN-element block with 3-bit
// quantised values: two bits per value in qs plus one bit per value in hmask.
template <int N> struct nrow_block_q3_k {
// [8bit scale] * N * 16
int8_t scales[N * 16];
// [b0, b1, b2, b3, b4, b5, b6, b7] ... [b248, b249, b250, b251, b252, b253, b254, b255]
uint8_t hmask[N * BLOCK_QNK_LEN / 8];
// [b0, b16, b32, b48] [b1, b17, b33, b49] ... [b15, b31, b47, b63]
// [b64, b80, b96, b112] ...[b79, b95, b111, b127]
// [b128, b144, b160, b176] ...[b143, b159, b175, b191]
// [b192, b208, b224, b240] ...[b207, b223, b239, b255]
uint8_t qs[N * BLOCK_QNK_LEN / 4];
// one 16-bit value per row - presumably the fp16 block scale; TODO confirm
uint16_t scales16[N];
};
// Layout holding N rows of one MXFP4 block: per row 1 byte in e, 4 bytes in qh
// and 16 bytes in qs.
// NOTE(review): e is presumably the shared block exponent and qs the packed
// 4-bit values; the role of qh is not visible here - confirm against the repack code.
template <int N> struct nrow_block_mxfp4 {
uint8_t e[N];
uint8_t qh[4 * N];
uint8_t qs[16 * N];
};
// Packed layout holding N rows of one 32-element Q5_1 block. Per row it stores
// a 16-bit scale, a 1-byte zero point, 4 bytes (32 bits) of qh and 16 bytes of
// qs holding two values per byte. The struct is packed so that the odd-sized zp
// array adds no padding; the static_assert below pins the N == 1 size at 23 bytes.
template <int N> struct __attribute__((packed)) nrow_block_q5_1 {
// one 16-bit scale per row - presumably fp16; TODO confirm
uint16_t scales16[N];
uint8_t zp[N];
// n0 [bh0, bh1, bh2, bh3, bh4, bh5, bh6, bh7] ....
uint8_t qh[4 * N];
// n0 [b0, b1], [b2, b3] .... [b30, b31]
// n1 [b0, b1], [b2, b3] .... [b30, b31]
uint8_t qs[16 * N];
};
static_assert(sizeof(nrow_block_q5_1<1>) == sizeof(uint8_t) + 22, "wrong nrow_block_q5_1 block size/padding");
// Packed layout holding N rows of one 32-element Q5_0 block. Per row it stores
// a 16-bit scale, 4 bytes (32 bits) of qh and 16 bytes of qs holding two values
// per byte. The static_assert below pins the N == 1 size at 22 bytes.
template <int N> struct __attribute__((packed)) nrow_block_q5_0 {
// one 16-bit scale per row - presumably fp16; TODO confirm
uint16_t scales16[N];
// n0 [bh0, bh1, bh2, bh3, bh4, bh5, bh6, bh7] ....
uint8_t qh[4 * N];
// n0 [b0, b1], [b2, b3] .... [b30, b31]
// n1 [b0, b1], [b2, b3] .... [b30, b31]
uint8_t qs[16 * N];
};
static_assert(sizeof(nrow_block_q5_0<1>) == 22, "wrong nrow_block_q5_0 block size/padding");
using gemm_kernel_quantize_def = std::function<
size_t(size_t, const uint8_t *, const uint8_t *, const uint8_t *, float *, size_t, size_t, size_t, size_t)>;
using moe_gemm_kernel_quantize_def = std::function<
size_t(size_t, const uint8_t **, const uint8_t *, const uint8_t *, float **, size_t, size_t, size_t, size_t)>;
namespace sqnbitgemm_spacemit_ime {
namespace ime1 {
size_t gemm_kernel_i8i4(size_t blk_len,
const std::byte * quant_a_ptr,
const std::byte * quant_b_data,
const float * quant_b_scale,
const std::byte * quant_b_zp,
float * c_ptr,
size_t count_m,
size_t count_n,
size_t count_k,
size_t block_count_k,
size_t ldc,
const float * bias,
const size_t scale_stride);
size_t gemm_kernel_i8i4(size_t blk_len,
const uint8_t * quant_a_ptr,
const uint8_t * quant_b_data,
const uint8_t * quant_b_zp,
float * c_ptr,
size_t count_m,
size_t count_n,
size_t k_blks,
size_t ldc);
void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
} // namespace ime1
} // namespace sqnbitgemm_spacemit_ime
namespace ime2 {
size_t gemm_kernel_i8i2k(size_t blk_len,
const uint8_t * quant_a_ptr,
const uint8_t * quant_b_data,
const uint8_t * quant_b_zp,
float * c_ptr,
size_t count_m,
size_t count_n,
size_t k_blks,
size_t ldc);
size_t gemm_kernel_i8i3k(size_t blk_len,
const uint8_t * quant_a_ptr,
const uint8_t * quant_b_data,
const uint8_t * quant_b_zp,
float * c_ptr,
size_t count_m,
size_t count_n,
size_t k_blks,
size_t ldc);
size_t gemm_kernel_i8i4(size_t blk_len,
const uint8_t * quant_a_ptr,
const uint8_t * quant_b_data,
const uint8_t * quant_b_zp,
float * c_ptr,
size_t count_m,
size_t count_n,
size_t k_blks,
size_t ldc);
size_t gemm_kernel_i8i4_hp(size_t blk_len,
const uint8_t * quant_a_ptr,
const uint8_t * quant_b_data,
const uint8_t * quant_b_zp,
float * c_ptr,
size_t count_m,
size_t count_n,
size_t k_blks,
size_t ldc);
size_t moe_m2_gemm_kernel_i8i4(size_t blk_len,
const uint8_t ** quant_a_ptr,
const uint8_t * quant_b_data,
const uint8_t * quant_b_zp,
float ** c_ptr,
size_t count_m,
size_t count_n,
size_t k_blks,
size_t ldc);
size_t gemm_kernel_i8i8(size_t blk_len,
const uint8_t * quant_a_ptr,
const uint8_t * quant_b_data,
const uint8_t * quant_b_zp,
float * c_ptr,
size_t count_m,
size_t count_n,
size_t k_blks,
size_t ldc);
size_t gemm_kernel_i8mxfp4(size_t blk_len,
const uint8_t * quant_a_ptr,
const uint8_t * quant_b_data,
const uint8_t * quant_b_zp,
float * c_ptr,
size_t count_m,
size_t count_n,
size_t k_blks,
size_t ldc);
size_t moe_m2_gemm_kernel_i8mxfp4(size_t blk_len,
const uint8_t ** quant_a_ptr,
const uint8_t * quant_b_data,
const uint8_t * quant_b_zp,
float ** c_ptr,
size_t count_m,
size_t count_n,
size_t k_blks,
size_t ldc);
size_t gemm_kernel_i8i5(size_t blk_len,
const uint8_t * quant_a_ptr,
const uint8_t * quant_b_data,
const uint8_t * quant_b_zp,
float * c_ptr,
size_t count_m,
size_t count_n,
size_t k_blks,
size_t ldc);
size_t moe_m2_gemm_kernel_i8i5(size_t blk_len,
const uint8_t ** quant_a_ptr,
const uint8_t * quant_b_data,
const uint8_t * quant_b_zp,
float ** c_ptr,
size_t count_m,
size_t count_n,
size_t k_blks,
size_t ldc);
} // namespace ime2
} // namespace spacemit_kernels
File diff suppressed because it is too large Load Diff
+14
View File
@@ -0,0 +1,14 @@
#pragma once
#include "ggml-common.h"
#include "ggml.h"
#include <cstddef>
#include <cstdint>
namespace ggml::cpu::riscv64_spacemit {
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
int repack(ggml_tensor * t, const void * data, size_t data_size);
} // namespace ggml::cpu::riscv64_spacemit
File diff suppressed because it is too large Load Diff
+95
View File
@@ -0,0 +1,95 @@
#pragma once
#include "ggml-cpu-impl.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <functional>
namespace spacemit_kernels {
// Integer ceiling division: returns (up + down - 1) / down, i.e. up / down
// rounded up for non-negative `up` and positive `down`.
//
// Written as an explicit template rather than with `auto` parameters: the
// abbreviated form is a C++20 feature, while this spelling accepts the same
// calls and also compiles as C++14/17.
template <typename T, typename U> constexpr auto div_round_up(T up, U down) {
    return (up + down - 1) / down;
}
// Byte size of one Q8 block of `blk_len` elements.
// Layout: [f32] [s16, only when with_blk_sum] [int8 * blk_len]
// For N blocks: [f32 * N] [s16 * N] [int8 * blk_len * N]
constexpr size_t q8_blk_size(size_t blk_len, bool with_blk_sum = false) {
    return sizeof(float) + (with_blk_sum ? sizeof(int16_t) : 0) + blk_len * sizeof(int8_t);
}
// Byte size of one Q8 HP row block of `blk_len` elements.
// The block is split into sub-blocks of 32 elements (the last one may be
// partial). The size is the sum of:
//   - blk_len int8 values,
//   - one _Float16-sized scale per sub-block,
//   - with_blk_sum:   one additional _Float16-sized sum per sub-block,
//   - with_blk_scale: one additional _Float16-sized value for the whole block.
constexpr size_t q8_hp_blk_size(size_t blk_len, bool with_blk_sum = false, bool with_blk_scale = false) {
    const size_t n_subblk = div_round_up(blk_len, size_t(32));

    size_t bytes = blk_len * sizeof(int8_t);
    bytes += n_subblk * sizeof(_Float16);
    if (with_blk_sum) {
        bytes += n_subblk * sizeof(_Float16);
    }
    if (with_blk_scale) {
        bytes += sizeof(_Float16);
    }
    return bytes;
}
// Byte size of one Q8K block of `blk_len` elements.
// Layout: [f32] [s16 * (blk_len / 16)] [int8 * blk_len]
// For N blocks: [f32 * N] [s16 * (blk_len / 16) * N] [int8 * blk_len * N]
// The s16 part is computed as sizeof(int16_t) * blk_len / 16, multiplying
// before dividing.
constexpr size_t q8k_blk_size(size_t blk_len) {
    return sizeof(float) + (sizeof(int16_t) * blk_len) / 16 + blk_len * sizeof(int8_t);
}
// Callable type with the same parameter list as the rvv::quantize_a_*row_* functions below:
// (blk_len, a_ptr, count_k, quant_a_ptr).
using quantize_a_row_def = std::function<void(size_t, const float *, size_t, uint8_t *)>;
// Kernel entry points. Only prototypes are visible here; the definitions live elsewhere.
namespace rvv {
// Copy helpers. memcpy2d takes separate dst/src strides plus a row count.
// NOTE(review): whether size/strides are in bytes or elements is not visible here - confirm.
void memcpy1d(void * dst, const void * src, int64_t size);
void memcpy2d(void * dst, int64_t dst_stride, const void * src, int64_t src_stride, int64_t tile_rows, int64_t size);
// Flash-attention forward variants. Both take a row range [ir0, ir1 - NOTE(review): confirm
// whether ir1 is inclusive) and a caller-provided scratch buffer (tcm_buffer, tcm_buffer_size).
void forward_flash_attn_ext_f16_one_chunk_vlen1024_vf16(const ggml_compute_params * params,
ggml_tensor * dst,
int ir0,
int ir1,
void * tcm_buffer,
size_t tcm_buffer_size);
void forward_flash_attn_ext_f16_tiled_vlen1024_vf16(const ggml_compute_params * params,
ggml_tensor * dst,
int ir0,
int ir1,
void * tcm_buffer,
size_t tcm_buffer_size);
// Per-op forward functions; each receives the compute params and the op tensor.
void forward_rms_norm_f32(ggml_compute_params * params, ggml_tensor * op);
void forward_norm_f32(ggml_compute_params * params, ggml_tensor * op);
void forward_cont_with_permute(ggml_compute_params * params, ggml_tensor * op);
void forward_cpy_with_permute(ggml_compute_params * params, ggml_tensor * op);
// Templated forward functions; T is the element type (forward_binary is additionally
// parameterised on the ggml_op it implements).
template <typename T> void forward_get_rows(ggml_compute_params * params, ggml_tensor * op);
template <typename T> void forward_concat(ggml_compute_params * params, ggml_tensor * op);
template <ggml_op op_type, typename T> void forward_binary(ggml_compute_params * params, ggml_tensor * op);
template <typename T> void forward_sum_rows(const ggml_compute_params * params, ggml_tensor * op);
template <typename T> void forward_repeat_nrows(ggml_compute_params * params, ggml_tensor * op);
template <typename T> void forward_repeat_dim1(ggml_compute_params * params, ggml_tensor * op);
// Activation quantizers, all matching quantize_a_row_def.
// NOTE(review): presumably the i8 / i8_hp / i8k variants write the Q8 / Q8 HP / Q8K block
// layouts sized by q8_blk_size / q8_hp_blk_size / q8k_blk_size - confirm in the definitions.
void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
void quantize_a_row_i8_hp(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
void quantize_a_4row_i8_hp(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
void quantize_a_row_i8k(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
void quantize_a_4row_i8k(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
} // namespace rvv
} // namespace spacemit_kernels
@@ -0,0 +1,34 @@
#pragma once
#include <atomic>
#include <cstdint>
#define SPINE_CACHE_LINE 64
#define SPINE_CACHE_ALIGN __attribute__((aligned(SPINE_CACHE_LINE)))
// State of a reusable spin barrier. Every member carries SPINE_CACHE_ALIGN, i.e. it is
// aligned to SPINE_CACHE_LINE bytes.
struct spine_barrier_t {
// Arrivals still outstanding in the current round; decremented by spine_barrier_wait.
SPINE_CACHE_ALIGN std::atomic<int64_t> pending_;
// Round counter; incremented by the last arriver, waiters spin until it changes.
SPINE_CACHE_ALIGN std::atomic<int64_t> rounds_;
// Number of participants; used to re-arm pending_ at the end of each round.
SPINE_CACHE_ALIGN int64_t total_;
};
// Blocks until every participant of barrier `b` has called this function for the current
// round. The barrier re-arms itself, so it can be used again immediately.
//
// The last arriver resets pending_ to total_ and then bumps rounds_; every other thread
// spins until rounds_ differs from the value it sampled on entry.
//
// Memory ordering: the spin must use an acquire load. With a relaxed load the waiter never
// synchronizes with the store that ends the round, so writes made by other threads before
// the barrier are not guaranteed to be visible afterwards on a weakly ordered CPU.
inline void spine_barrier_wait(spine_barrier_t * b) {
    const auto cur_round = b->rounds_.load(std::memory_order_acquire);

    // Sequentially consistent read-modify-write: releases this thread's earlier writes and,
    // for the last arriver, acquires the writes of every thread that arrived before it.
    const auto cnt = --b->pending_;

    if (cnt == 0) {
        // Re-arm first, then publish the new round, so that a thread re-entering the
        // barrier right after being released already sees a full counter.
        b->pending_.store(b->total_);
        b->rounds_.store(cur_round + 1);
    } else {
        // Acquire pairs with the rounds_ store above.
        while (cur_round == b->rounds_.load(std::memory_order_acquire)) {
            __asm__ volatile("pause " ::: "memory");
        }
    }
}
// Initialises the first num_barriers entries of the array `b` for thread_count
// participants: total_ and pending_ are set to thread_count and the round counter to 0.
// Does nothing when num_barriers <= 0.
inline void spine_barrier_init(spine_barrier_t * b, int num_barriers, uint64_t thread_count) {
    int idx = 0;
    while (idx < num_barriers) {
        spine_barrier_t & barrier = b[idx];
        barrier.total_ = thread_count;
        barrier.pending_.store(thread_count);
        barrier.rounds_.store(0);
        ++idx;
    }
}
@@ -0,0 +1,760 @@
#include "spine_mem_pool.h"
#include "common.h"
#include "ime_env.h"
#include "spine_tcm.h"
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <algorithm>
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <vector>
namespace ggml::cpu::riscv64_spacemit {
namespace {
// Default chunk size of the transparent-hugepage pool: 512 MiB.
constexpr size_t SPINE_MEM_POOL_CHUNK_SIZE = 512ull * 1024ull * 1024ull;
// Chunk size of the shared-memory pool: 512 KiB.
constexpr size_t SPINE_SHARE_MEM_POOL_CHUNK_SIZE = 512ull * 1024ull;
// Region granularity of the hugetlb_1g pool: 1 GiB.
constexpr size_t SPINE_MEM_POOL_1G_REGION_SIZE = 1ull << 30;
// Flag bit 0 placed in hugetlb_1g_region::flags for HUGETLB_1G_IOC_ALLOC.
// NOTE(review): exact driver semantics of "require PUD" are not visible here.
constexpr uint64_t HUGETLB_1G_FLAG_REQUIRE_PUD = 1ull << 0;
// Device node opened by the hugetlb_1g pool.
constexpr char SPINE_MEM_POOL_HUGETLB_1G_DEV[] = "/dev/hugetlb_1g";
// Device node opened and mapped by the shared-memory pool.
constexpr char SPINE_MEM_POOL_TCM_SYNC_MEM_DEV[] = "/dev/tcm_sync_mem";
// Argument block exchanged with the /dev/hugetlb_1g driver through HUGETLB_1G_IOC_ALLOC.
// The hugetlb_1g pool fills `size` and `flags` before the ioctl and reads `size` back
// afterwards to size the mapping.
// NOTE(review): the meaning of dma_addr and reserved is defined by the driver and is not
// visible in this file.
struct hugetlb_1g_region {
uint64_t size{ 0 };
uint64_t dma_addr{ 0 };
uint64_t flags{ 0 };
uint64_t reserved{ 0 };
};
// ioctl request codes for the hugetlb_1g device: ALLOC reads and writes a hugetlb_1g_region,
// FREE takes no argument.
#define HUGETLB_1G_IOC_MAGIC 'M'
#define HUGETLB_1G_IOC_ALLOC _IOWR(HUGETLB_1G_IOC_MAGIC, 0x00, struct hugetlb_1g_region)
#define HUGETLB_1G_IOC_FREE _IO(HUGETLB_1G_IOC_MAGIC, 0x01)
// One free range inside a pool chunk.
struct free_block {
// Byte offset of the range from the chunk base.
size_t offset{ 0 };
// Length of the range in bytes.
size_t size{ 0 };
};
// One backing memory region owned by a pool, together with its free list.
struct pool_chunk {
// Start address of the region.
uint8_t * base{ nullptr };
// Region length in bytes.
size_t size{ 0 };
// Device file descriptor backing the region, or -1 when there is none.
int fd{ -1 };
// Free ranges of the region, kept sorted by offset by the pool manager.
std::vector<free_block> free_blocks;
};
// Bookkeeping record for one live allocation.
struct pool_allocation {
// Base and size of the owning chunk; together they identify the chunk on free.
void * chunk_base{ nullptr };
size_t chunk_size{ 0 };
// Address handed to the caller and the (alignment-rounded) number of bytes reserved.
void * base{ nullptr };
size_t size{ 0 };
};
// Returns true when value has exactly one bit set; 0 is not a power of two.
bool is_power_of_two(size_t value) {
    if (value == 0) {
        return false;
    }
    // Clearing the lowest set bit leaves zero only when a single bit was set.
    return (value & (value - 1)) == 0;
}
// Rounds value up to the next multiple of alignment and stores it in *aligned_value.
// Works for any non-zero alignment, not only powers of two.
// Returns false (leaving *aligned_value untouched) when aligned_value is null, alignment
// is 0, or the rounded result would not fit in size_t.
bool align_up(size_t value, size_t alignment, size_t * aligned_value) {
    if (alignment == 0 || aligned_value == nullptr) {
        return false;
    }

    const size_t remainder = value % alignment;
    const size_t padding   = remainder == 0 ? 0 : alignment - remainder;

    // Overflow guard for value + padding.
    if (padding > std::numeric_limits<size_t>::max() - value) {
        return false;
    }

    *aligned_value = value + padding;
    return true;
}
// Address flavour of align_up: rounds value up to the next multiple of alignment and
// stores it in *aligned_value.
// Returns false (leaving *aligned_value untouched) when aligned_value is null, alignment
// is 0, or the rounded address would not fit in uintptr_t.
bool align_up_uintptr(uintptr_t value, size_t alignment, uintptr_t * aligned_value) {
    if (alignment == 0 || aligned_value == nullptr) {
        return false;
    }

    const uintptr_t remainder = value % alignment;
    const uintptr_t padding   = remainder == 0 ? 0 : alignment - remainder;

    // Overflow guard for value + padding.
    if (padding > std::numeric_limits<uintptr_t>::max() - value) {
        return false;
    }

    *aligned_value = value + padding;
    return true;
}
// Base class of the memory pools. It keeps a list of chunks, each with an offset-sorted
// free list, and a map from returned pointer to its allocation record. Derived classes
// supply the chunk backing through alloc_chunk / dealloc_chunk.
// alloc(), free() and release_chunks() take mutex_; the *_locked helpers expect it to be
// held by the caller.
class spine_mem_pool_manager {
public:
// default_chunk_size: minimum size requested for a new chunk; 0 means "exactly what the
// allocation needs" (see add_chunk_locked).
explicit spine_mem_pool_manager(size_t default_chunk_size) : default_chunk_size_(default_chunk_size) {}
virtual ~spine_mem_pool_manager() = default;
// Allocates `size` bytes aligned to `alignment`.
// Returns nullptr when size is 0, alignment is not a power of two, the size cannot be
// rounded up, or no chunk can satisfy the request. Rethrows std::bad_alloc raised while
// recording the allocation, after giving the block back to its chunk.
void * alloc(size_t size, size_t alignment) {
if (size == 0 || !is_power_of_two(alignment)) {
return nullptr;
}
// The reserved size is the request rounded up to a multiple of the alignment.
size_t aligned_size = 0;
if (!align_up(size, alignment, &aligned_size)) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: align_up failed for size %zu alignment %zu\n", __func__, size,
alignment);
return nullptr;
}
pool_allocation allocation;
std::lock_guard<std::mutex> lock(mutex_);
// First try the existing chunks; if nothing fits, add one chunk and retry once.
if (!try_alloc_locked(aligned_size, alignment, &allocation)) {
if (!add_chunk_locked(aligned_size, alignment)) {
return nullptr;
}
if (!try_alloc_locked(aligned_size, alignment, &allocation)) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation retry failed for size %zu alignment %zu\n",
__func__, aligned_size, alignment);
return nullptr;
}
}
// Record the allocation keyed by its address; undo the carve-out if that fails.
try {
const auto [allocation_it, inserted] = allocations_.emplace(allocation.base, allocation);
if (!inserted) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: duplicate allocation key %p\n", __func__, allocation.base);
rollback_allocation_locked(allocation);
return nullptr;
}
} catch (const std::bad_alloc &) {
rollback_allocation_locked(allocation);
throw;
}
return allocation.base;
}
// Returns a block obtained from alloc() to its chunk. nullptr is ignored; an address that
// is not a live allocation is logged and ignored. The allocation record is removed before
// the chunk checks, so a failed check only logs. A chunk that becomes completely free is
// handed back to the backend.
void free(void * base) {
if (base == nullptr) {
return;
}
std::lock_guard<std::mutex> lock(mutex_);
auto allocation_it = allocations_.find(base);
if (allocation_it == allocations_.end()) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: unknown allocation %p\n", __func__, base);
return;
}
// Copy the record out before erasing it from the map.
pool_allocation allocation = allocation_it->second;
allocations_.erase(allocation_it);
auto chunk_it = find_chunk_locked(allocation);
if (chunk_it == chunks_.end()) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: unknown chunk for allocation %p size %zu\n", __func__,
allocation.base, allocation.size);
return;
}
auto * chunk_base = chunk_it->base;
auto * alloc_base = static_cast<uint8_t *>(allocation.base);
if (alloc_base < chunk_base || alloc_base >= chunk_base + chunk_it->size) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation %p out of chunk range %p..%p\n", __func__,
allocation.base, chunk_base, chunk_base + chunk_it->size);
return;
}
const size_t offset = static_cast<size_t>(alloc_base - chunk_base);
if (offset > chunk_it->size || allocation.size > chunk_it->size - offset) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation %p size %zu exceeds chunk size %zu\n", __func__,
allocation.base, allocation.size, chunk_it->size);
return;
}
insert_free_block_locked(*chunk_it, { offset, allocation.size });
maybe_release_empty_chunk_locked(chunk_it);
}
protected:
// Drops every allocation record and hands every chunk back to the backend. The derived
// classes in this file call it from their destructors.
void release_chunks() {
std::lock_guard<std::mutex> lock(mutex_);
allocations_.clear();
for (auto & chunk : chunks_) {
dealloc_chunk(&chunk);
}
chunks_.clear();
}
size_t default_chunk_size() const { return default_chunk_size_; }
// Resets a chunk descriptor to its empty state; it does not release any resource.
static void clear_chunk(pool_chunk * chunk) {
chunk->base = nullptr;
chunk->size = 0;
chunk->fd = -1;
chunk->free_blocks.clear();
}
// Backend hook: obtain a region of at least min_size bytes and describe it in *chunk.
// hint_addr is the end of the highest existing chunk, or nullptr when there is none.
virtual bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) = 0;
// Backend hook: release the region described by *chunk.
virtual void dealloc_chunk(pool_chunk * chunk) = 0;
private:
// Best placement found so far while scanning the free lists in try_alloc_locked.
struct alloc_candidate {
size_t chunk_index{ 0 };
size_t block_index{ 0 };
size_t aligned_offset{ 0 };
uintptr_t address{ std::numeric_limits<uintptr_t>::max() };
bool valid{ false };
};
// Finds the chunk whose base and size match the ones stored in the allocation record.
std::vector<pool_chunk>::iterator find_chunk_locked(const pool_allocation & allocation) {
return std::find_if(chunks_.begin(), chunks_.end(), [&](const pool_chunk & chunk) {
return chunk.base == allocation.chunk_base && chunk.size == allocation.chunk_size;
});
}
// Asks the backend for a new chunk of at least max(min_size, default_chunk_size_) bytes
// and appends it with a single free block covering it. Returns false when the backend
// fails or returns an unusable chunk; rethrows std::bad_alloc after releasing the chunk.
bool add_chunk_locked(size_t min_size, size_t alignment) {
pool_chunk chunk;
const size_t chunk_request = default_chunk_size_ == 0 ? min_size : std::max(min_size, default_chunk_size_);
// The hint is the highest end address among the existing chunks.
void * hint_addr = nullptr;
for (const auto & existing_chunk : chunks_) {
auto * chunk_end = existing_chunk.base + existing_chunk.size;
if (hint_addr == nullptr || chunk_end > hint_addr) {
hint_addr = chunk_end;
}
}
if (!alloc_chunk(chunk_request, alignment, hint_addr, &chunk)) {
return false;
}
if (chunk.base == nullptr || chunk.size < min_size) {
GGML_LOG_ERROR(
"CPU_RISCV64_SPACEMIT: %s: invalid chunk returned for request size %zu, chunk_base=%p chunk_size=%zu\n",
__func__, min_size, chunk.base, chunk.size);
dealloc_chunk(&chunk);
return false;
}
try {
chunk.free_blocks.push_back({ 0, chunk.size });
chunks_.push_back(std::move(chunk));
} catch (const std::bad_alloc &) {
dealloc_chunk(&chunk);
throw;
}
return true;
}
// Gives a block carved by try_alloc_locked back to its chunk; used when recording the
// allocation failed. Performs the same range checks as free().
void rollback_allocation_locked(const pool_allocation & allocation) {
auto chunk_it = find_chunk_locked(allocation);
if (chunk_it == chunks_.end()) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p, owning chunk not found\n",
__func__, allocation.base);
return;
}
auto * chunk_base = chunk_it->base;
auto * alloc_base = static_cast<uint8_t *>(allocation.base);
if (alloc_base < chunk_base || alloc_base >= chunk_base + chunk_it->size) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p, chunk range is invalid\n",
__func__, allocation.base);
return;
}
const size_t offset = static_cast<size_t>(alloc_base - chunk_base);
if (offset > chunk_it->size || allocation.size > chunk_it->size - offset) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p size %zu\n", __func__,
allocation.base, allocation.size);
return;
}
insert_free_block_locked(*chunk_it, { offset, allocation.size });
maybe_release_empty_chunk_locked(chunk_it);
}
// Scans every free block of every chunk and picks the fitting placement with the lowest
// aligned address. On success the chosen free block is split into optional leading
// padding, the allocation itself and an optional trailing remainder, and *allocation is
// filled in. Returns false when nothing fits.
bool try_alloc_locked(size_t size, size_t alignment, pool_allocation * allocation) {
alloc_candidate best;
for (size_t chunk_index = 0; chunk_index < chunks_.size(); ++chunk_index) {
const auto & chunk = chunks_[chunk_index];
for (size_t block_index = 0; block_index < chunk.free_blocks.size(); ++block_index) {
const auto & block = chunk.free_blocks[block_index];
// Alignment is applied to the absolute address, not to the offset in the chunk.
uintptr_t aligned_addr = 0;
const auto block_addr = reinterpret_cast<uintptr_t>(chunk.base + block.offset);
if (!align_up_uintptr(block_addr, alignment, &aligned_addr)) {
continue;
}
if (aligned_addr < block_addr) {
continue;
}
const size_t aligned_offset = block.offset + static_cast<size_t>(aligned_addr - block_addr);
const size_t padding = aligned_offset - block.offset;
if (padding > block.size || size > block.size - padding) {
continue;
}
if (!best.valid || aligned_addr < best.address) {
best.chunk_index = chunk_index;
best.block_index = block_index;
best.aligned_offset = aligned_offset;
best.address = aligned_addr;
best.valid = true;
}
}
}
if (!best.valid) {
return false;
}
auto & chunk = chunks_[best.chunk_index];
// Copy the block: the erase below invalidates references into free_blocks.
const free_block block = chunk.free_blocks[best.block_index];
const size_t padding = best.aligned_offset - block.offset;
const size_t alloc_end = best.aligned_offset + size;
const size_t block_end = block.offset + block.size;
chunk.free_blocks.erase(chunk.free_blocks.begin() + best.block_index);
auto insert_it = chunk.free_blocks.begin() + best.block_index;
// Re-insert the pieces in offset order so the free list stays sorted.
if (padding != 0) {
insert_it = chunk.free_blocks.insert(insert_it, { block.offset, padding });
++insert_it;
}
if (alloc_end < block_end) {
chunk.free_blocks.insert(insert_it, { alloc_end, block_end - alloc_end });
}
allocation->chunk_base = chunk.base;
allocation->chunk_size = chunk.size;
allocation->base = chunk.base + best.aligned_offset;
allocation->size = size;
return true;
}
// Hands the chunk back to the backend and removes it from chunks_ when its free list is
// a single block covering the whole chunk. Invalidates chunk_it in that case.
void maybe_release_empty_chunk_locked(std::vector<pool_chunk>::iterator chunk_it) {
if (chunk_it->free_blocks.size() != 1) {
return;
}
const auto & block = chunk_it->free_blocks.front();
if (block.offset != 0 || block.size != chunk_it->size) {
return;
}
dealloc_chunk(&*chunk_it);
chunks_.erase(chunk_it);
}
// Inserts `block` into the chunk's offset-sorted free list and merges it with an adjacent
// predecessor and/or successor. A block that overlaps an existing free block is logged
// and dropped.
void insert_free_block_locked(pool_chunk & chunk, free_block block) {
// Find the first free block that starts at or after the new one.
auto it = chunk.free_blocks.begin();
while (it != chunk.free_blocks.end() && it->offset < block.offset) {
++it;
}
if (it != chunk.free_blocks.begin()) {
const auto & prev = *(it - 1);
if (prev.offset + prev.size > block.offset) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: overlapping free block at offset %zu size %zu\n", __func__,
block.offset, block.size);
return;
}
}
if (it != chunk.free_blocks.end() && block.offset + block.size > it->offset) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: overlapping next free block at offset %zu size %zu\n", __func__,
block.offset, block.size);
return;
}
it = chunk.free_blocks.insert(it, block);
// Merge with the predecessor when it ends exactly where the new block starts.
if (it != chunk.free_blocks.begin()) {
auto prev = it - 1;
if (prev->offset + prev->size == it->offset) {
it->offset = prev->offset;
it->size += prev->size;
it = chunk.free_blocks.erase(prev);
}
}
// Merge with the successor when the (possibly grown) block ends where it starts.
if (it + 1 != chunk.free_blocks.end() && it->offset + it->size == (it + 1)->offset) {
it->size += (it + 1)->size;
chunk.free_blocks.erase(it + 1);
}
}
// Guards chunks_ and allocations_.
std::mutex mutex_;
std::vector<pool_chunk> chunks_;
// Live allocations keyed by the address returned from alloc().
std::unordered_map<void *, pool_allocation> allocations_;
size_t default_chunk_size_{ 0 };
};
// Pool backend whose chunks come from posix_memalign. It is constructed with a default
// chunk size of 0, so each chunk is requested with exactly the size the pool asks for.
class spine_mem_pool_posix final : public spine_mem_pool_manager {
  public:
    spine_mem_pool_posix() : spine_mem_pool_manager(0) {}

    ~spine_mem_pool_posix() override { release_chunks(); }

  private:
    // Allocates min_size bytes aligned to at least sizeof(void *). hint_addr is ignored.
    // Returns false and logs when posix_memalign reports an error.
    bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
        (void) hint_addr;

        const size_t alloc_alignment = alignment < sizeof(void *) ? sizeof(void *) : alignment;

        void * memory = nullptr;
        if (const int rc = posix_memalign(&memory, alloc_alignment, min_size); rc != 0) {
            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: posix_memalign failed for size %zu alignment %zu, rc=%d\n",
                           __func__, min_size, alloc_alignment, rc);
            return false;
        }

        chunk->fd   = -1;
        chunk->size = min_size;
        chunk->base = static_cast<uint8_t *>(memory);
        return true;
    }

    // Frees the chunk memory and resets the descriptor.
    void dealloc_chunk(pool_chunk * chunk) override {
        uint8_t * const memory = chunk->base;
        std::free(memory);
        clear_chunk(chunk);
    }
};
// Pool backend whose chunks are anonymous private mappings advised with MADV_HUGEPAGE.
// Chunk sizes are rounded up to a multiple of SPINE_MEM_POOL_CHUNK_SIZE.
class spine_mem_pool_transparent_hugepage final : public spine_mem_pool_manager {
  public:
    spine_mem_pool_transparent_hugepage() : spine_mem_pool_manager(SPINE_MEM_POOL_CHUNK_SIZE) {}

    ~spine_mem_pool_transparent_hugepage() override { release_chunks(); }

  private:
    // Maps a new chunk of at least min_size bytes, passing hint_addr to mmap as the
    // preferred address. `alignment` is not used.
    // Returns false when the size cannot be rounded or mmap fails. A failing
    // madvise(MADV_HUGEPAGE) is only logged: the mapping is valid without the hint, and
    // failing here would make the pool unusable on kernels built without transparent
    // huge page support.
    bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
        (void) alignment;

        size_t chunk_size = 0;
        if (!align_up(min_size, default_chunk_size(), &chunk_size)) {
            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to round chunk size for %zu\n", __func__, min_size);
            return false;
        }

        void * map_addr = mmap(hint_addr, chunk_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (map_addr == MAP_FAILED) {
            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for chunk size %zu, errno=%d\n", __func__, chunk_size,
                           errno);
            return false;
        }

        // Best-effort hint: keep the mapping and fall back to regular pages on failure.
        if (madvise(map_addr, chunk_size, MADV_HUGEPAGE) != 0) {
            GGML_LOG_ERROR(
                "CPU_RISCV64_SPACEMIT: %s: madvise(MADV_HUGEPAGE) failed for chunk size %zu, errno=%d; "
                "continuing with regular pages\n",
                __func__, chunk_size, errno);
        }

        chunk->base = static_cast<uint8_t *>(map_addr);
        chunk->size = chunk_size;
        chunk->fd   = -1;
        return true;
    }

    // Unmaps the chunk (logging a munmap failure) and resets the descriptor.
    void dealloc_chunk(pool_chunk * chunk) override {
        if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {
            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for chunk %p size %zu, errno=%d\n", __func__,
                           chunk->base, chunk->size, errno);
        }
        clear_chunk(chunk);
    }
};
class spine_mem_pool_hugetlb_1g final : public spine_mem_pool_manager {
public:
spine_mem_pool_hugetlb_1g() : spine_mem_pool_manager(SPINE_MEM_POOL_1G_REGION_SIZE) {}
~spine_mem_pool_hugetlb_1g() override { release_chunks(); }
private:
bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
(void) alignment;
(void) hint_addr;
size_t region_size = 0;
if (!align_up(min_size, SPINE_MEM_POOL_1G_REGION_SIZE, &region_size)) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to round hugetlb_1g size for %zu\n", __func__, min_size);
return false;
}
const int fd = open(SPINE_MEM_POOL_HUGETLB_1G_DEV, O_RDWR);
if (fd < 0) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: open(%s) failed, errno=%d\n", __func__,
SPINE_MEM_POOL_HUGETLB_1G_DEV, errno);
return false;
}
hugetlb_1g_region region;
region.size = region_size;
region.flags = HUGETLB_1G_FLAG_REQUIRE_PUD;
if (ioctl(fd, HUGETLB_1G_IOC_ALLOC, &region) < 0) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: HUGETLB_1G_IOC_ALLOC failed for size %zu, errno=%d\n", __func__,
region_size, errno);
close(fd);
return false;
}
void * map_addr = mmap(nullptr, region.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (map_addr == MAP_FAILED) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for hugetlb_1g size %llu, errno=%d\n", __func__,
static_cast<unsigned long long>(region.size), errno);
ioctl(fd, HUGETLB_1G_IOC_FREE);
close(fd);
return false;
}
chunk->base = static_cast<uint8_t *>(map_addr);
chunk->size = region.size;
chunk->fd = fd;
return true;
}
void dealloc_chunk(pool_chunk * chunk) override {
if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for hugetlb_1g chunk %p size %zu, errno=%d\n",
__func__, chunk->base, chunk->size, errno);
}
if (chunk->fd >= 0) {
if (ioctl(chunk->fd, HUGETLB_1G_IOC_FREE) < 0) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: HUGETLB_1G_IOC_FREE failed for chunk %p, errno=%d\n",
__func__, chunk->base, errno);
}
close(chunk->fd);
}
clear_chunk(chunk);
}
};
// Pool backend backed by a single mapping of the /dev/tcm_sync_mem device. The chunk
// always has the fixed size SPINE_SHARE_MEM_POOL_CHUNK_SIZE, and only one chunk may be
// active at a time.
class spine_mem_pool_shared_mem final : public spine_mem_pool_manager {
  public:
    spine_mem_pool_shared_mem() : spine_mem_pool_manager(SPINE_SHARE_MEM_POOL_CHUNK_SIZE) {}

    ~spine_mem_pool_shared_mem() override { release_chunks(); }

  private:
    // Opens and maps the device. Fails when a chunk already exists (signalled by a non-null
    // hint_addr), when min_size exceeds the fixed chunk size, or when open/mmap fails.
    // `alignment` is not used.
    bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
        (void) alignment;

        if (hint_addr != nullptr) {
            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: shared_mem does not support multiple active chunks\n", __func__);
            return false;
        }
        if (min_size > default_chunk_size()) {
            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: shared_mem request %zu exceeds chunk size %zu\n", __func__,
                           min_size, default_chunk_size());
            return false;
        }

        const int dev_fd = open(SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, O_RDWR | O_SYNC);
        if (dev_fd < 0) {
            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: open(%s) failed, errno=%d\n", __func__,
                           SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, errno);
            return false;
        }

        void * mapped = mmap(nullptr, default_chunk_size(), PROT_READ | PROT_WRITE, MAP_SHARED, dev_fd, 0);
        if (mapped == MAP_FAILED) {
            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for %s size %zu, errno=%d\n", __func__,
                           SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, default_chunk_size(), errno);
            close(dev_fd);
            return false;
        }

        chunk->fd   = dev_fd;
        chunk->size = default_chunk_size();
        chunk->base = static_cast<uint8_t *>(mapped);
        return true;
    }

    // Unmaps the chunk (logging a munmap failure), closes the device descriptor and resets
    // the chunk descriptor.
    void dealloc_chunk(pool_chunk * chunk) override {
        const bool has_mapping = chunk->base != nullptr && chunk->size != 0;
        if (has_mapping && munmap(chunk->base, chunk->size) != 0) {
            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for shared_mem chunk %p size %zu, errno=%d\n",
                           __func__, chunk->base, chunk->size, errno);
        }

        const int dev_fd = chunk->fd;
        if (dev_fd >= 0) {
            close(dev_fd);
        }

        clear_chunk(chunk);
    }
};
// Returns the process-wide general-purpose pool, creating it on first use.
// The backend is taken from global_spine_env_info.mem_backend at the first call ("none"
// selects the transparent-hugepage pool) and is fixed afterwards; a later call that asks
// for a different backend only logs an error and still gets the pool created first.
// Throws std::bad_alloc when no pool object exists.
spine_mem_pool_manager & get_spine_mem_pool_manager() {
    using backend_t = spine_mem_pool_backend;

    static std::once_flag                          pool_once;
    static std::unique_ptr<spine_mem_pool_manager> selected_pool;
    static backend_t                               selected_backend = backend_t::none;

    const backend_t configured = global_spine_env_info.mem_backend;
    const backend_t backend    = configured == backend_t::none ? backend_t::transparent_hugepage : configured;

    std::call_once(pool_once, [&]() {
        selected_backend = backend;
        if (backend == backend_t::posix_memalign) {
            selected_pool = std::make_unique<spine_mem_pool_posix>();
        } else if (backend == backend_t::hugetlb_1g) {
            selected_pool = std::make_unique<spine_mem_pool_hugetlb_1g>();
        } else if (backend == backend_t::transparent_hugepage || backend == backend_t::none) {
            selected_backend = backend_t::transparent_hugepage;
            selected_pool    = std::make_unique<spine_mem_pool_transparent_hugepage>();
        }
        // Any other value leaves selected_pool empty, which is reported below.
    });

    if (backend != selected_backend) {
        GGML_LOG_ERROR(
            "CPU_RISCV64_SPACEMIT: %s: mem pool backend is process-global and mutually exclusive, requested=%d but "
            "selected=%d\n",
            __func__, static_cast<int>(backend), static_cast<int>(selected_backend));
    }

    if (!selected_pool) {
        throw std::bad_alloc();
    }
    return *selected_pool;
}
// Returns the process-wide shared-memory pool, creating it on first use.
// Throws std::bad_alloc when the pool object does not exist.
spine_mem_pool_manager & get_spine_mem_pool_shared_mem_manager() {
    static std::once_flag                             shared_mem_pool_once;
    static std::unique_ptr<spine_mem_pool_shared_mem> shared_mem_pool;

    std::call_once(shared_mem_pool_once, [&]() {
        shared_mem_pool = std::make_unique<spine_mem_pool_shared_mem>();
    });

    if (!shared_mem_pool) {
        throw std::bad_alloc();
    }
    return *shared_mem_pool;
}
} // namespace
// Binds the spine_tcm runtime and reports its memory geometry in *info.
// *info is reset to its defaults first. Returns false (leaving *info at its defaults) when
// info is null, the runtime cannot be opened, it reports itself unavailable, or the layout
// query fails; otherwise fills every field and returns true.
bool spine_mem_pool_tcm_init(spine_mem_pool_tcm_info * info) noexcept {
    if (info == nullptr) {
        return false;
    }
    *info = {};

    const bool opened = spine_tcm_open_handle(NULL) == 0;
    if (!opened || !spine_tcm_is_available()) {
        return false;
    }

    spine_tcm_mem_info_t layout;
    if (spine_tcm_mem_info(&layout) != 0) {
        return false;
    }

    info->is_fake_tcm = layout.is_fake_tcm != 0;
    info->blk_num     = layout.blk_num;
    info->blk_size    = layout.blk_size;
    info->available   = true;
    return true;
}
// Forwards to spine_tcm_mem_get, using cpu_id as the TCM id, and returns its result.
void * spine_mem_pool_tcm_mem_get(int cpu_id) noexcept {
    void * const buffer = spine_tcm_mem_get(cpu_id);
    return buffer;
}
// Forwards to spine_tcm_mem_try_wait with cpu_id as the TCM id and a fixed wait budget of
// 1000 * 1000; returns its result.
// NOTE(review): the runtime prototype names this argument timeout_us, which would make the
// budget one second - confirm the unit with the runtime.
void * spine_mem_pool_tcm_mem_wait(int cpu_id) noexcept {
    constexpr size_t wait_budget = 1000 * 1000;
    return spine_tcm_mem_try_wait(cpu_id, wait_budget);
}
// Forwards to spine_tcm_mem_release, using cpu_id as the TCM id, and returns its status.
int spine_mem_pool_tcm_mem_release(int cpu_id) noexcept {
    const int status = spine_tcm_mem_release(cpu_id);
    return status;
}
void * spine_mem_pool_alloc(size_t size, size_t alignment) noexcept {
try {
return get_spine_mem_pool_manager().alloc(size, alignment);
} catch (const std::bad_alloc &) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while allocating size %zu\n", __func__, size);
return nullptr;
}
}
void * spine_mem_pool_shared_mem_alloc(size_t size, size_t alignment) noexcept {
try {
return get_spine_mem_pool_shared_mem_manager().alloc(size, alignment);
} catch (const std::bad_alloc &) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while allocating shared memory size %zu\n", __func__, size);
return nullptr;
}
}
void spine_mem_pool_free(void * base) noexcept {
try {
get_spine_mem_pool_manager().free(base);
} catch (const std::bad_alloc &) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while freeing allocation %p\n", __func__, base);
}
}
void spine_mem_pool_shared_mem_free(void * base) noexcept {
try {
get_spine_mem_pool_shared_mem_manager().free(base);
} catch (const std::bad_alloc &) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while freeing shared allocation %p\n", __func__, base);
}
}
} // namespace ggml::cpu::riscv64_spacemit
extern "C" {
void * ggml_backend_cpu_riscv64_spacemit_alloc_shared(size_t size, size_t alignment) {
void * result = ggml::cpu::riscv64_spacemit::spine_mem_pool_shared_mem_alloc(size, alignment);
if (result == nullptr) {
GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to allocate shared memory size %zu alignment %zu\n", __func__,
size, alignment);
}
return result;
}
void ggml_backend_cpu_riscv64_spacemit_free_shared(void * ptr) {
ggml::cpu::riscv64_spacemit::spine_mem_pool_shared_mem_free(ptr);
}
}
@@ -0,0 +1,32 @@
#pragma once
#include <cstddef>
#include <cstdint>
// Public interface of the SpacemiT memory pools and of the TCM access wrappers.
namespace ggml::cpu::riscv64_spacemit {
// Backing store used by the general-purpose pool.
enum class spine_mem_pool_backend : uint8_t {
// No explicit choice.
none,
// Chunks from posix_memalign.
posix_memalign,
// Anonymous mappings advised with MADV_HUGEPAGE.
transparent_hugepage,
// Regions from the hugetlb_1g device.
hugetlb_1g,
};
// Result of spine_mem_pool_tcm_init.
struct spine_mem_pool_tcm_info {
// True once the TCM runtime was bound and queried successfully.
bool available{ false };
// Block size and block count reported by the runtime.
size_t blk_size{ 0 };
size_t blk_num{ 0 };
// True when the runtime reports a fake-TCM backend.
bool is_fake_tcm{ false };
};
// Fills *info from the TCM runtime; returns false when info is null or the runtime is not
// usable.
bool spine_mem_pool_tcm_init(spine_mem_pool_tcm_info * info) noexcept;
// TCM block access for the given cpu_id.
// NOTE(review): ownership and pairing rules (get vs. wait/release) are defined by the TCM
// runtime and are not visible from these prototypes.
void * spine_mem_pool_tcm_mem_get(int cpu_id) noexcept;
void * spine_mem_pool_tcm_mem_wait(int cpu_id) noexcept;
int spine_mem_pool_tcm_mem_release(int cpu_id) noexcept;
// Allocation from the general-purpose pool and from the shared-memory pool. Both return a
// pointer (nullptr on failure) that must be given back with the matching *_free function.
void * spine_mem_pool_alloc(size_t size, size_t alignment) noexcept;
void * spine_mem_pool_shared_mem_alloc(size_t size, size_t alignment) noexcept;
void spine_mem_pool_free(void * base) noexcept;
void spine_mem_pool_shared_mem_free(void * base) noexcept;
} // namespace ggml::cpu::riscv64_spacemit
+409
View File
@@ -0,0 +1,409 @@
#ifndef SPINE_TCM_PUBLIC_H_
#define SPINE_TCM_PUBLIC_H_
/*
* spine_tcm public API
*
* Usage:
* 1. Direct link mode
* Define SPINE_TCM_DIRECT_LINK and link against libspine_tcm.so.
*
* if (spine_tcm_is_available()) {
* void *buffer = spine_tcm_mem_get(0);
* spine_tcm_mem_free(0);
* }
*
* 2. Header-only loader mode
* Include this header without linking libspine_tcm.so. The loader first
* tries to reuse a process-global spine_tcm instance and falls back to
* dlopen("libspine_tcm.so") when needed.
*
* spine_tcm_open_handle(NULL); // optional pre-bind
* if (spine_tcm_is_available()) {
* void *buffer = spine_tcm_mem_get(0);
* spine_tcm_mem_free(0);
* }
*/
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#if !defined(SPINE_TCM_BUILD_SHARED) && !defined(SPINE_TCM_DIRECT_LINK)
# include <dlfcn.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
#if defined(_WIN32)
# if defined(SPINE_TCM_BUILD_SHARED)
# define SPINE_TCM_API __declspec(dllexport)
# else
# define SPINE_TCM_API __declspec(dllimport)
# endif
#else
# define SPINE_TCM_API __attribute__((visibility("default")))
#endif
/* Runtime memory geometry, filled by spine_tcm_runtime_layout_info. */
typedef struct spine_tcm_mem_info {
/* Size of one TCM block. NOTE(review): presumably in bytes - confirm with the runtime. */
size_t blk_size;
/* Number of TCM blocks. */
size_t blk_num;
/* Non-zero when the current backend is fake TCM. */
int is_fake_tcm;
} spine_tcm_mem_info_t;
/* Per-block runtime metadata, filled by spine_tcm_runtime_mem_info for one TCM id. */
typedef struct spine_tcm_block_info {
/* TCM id of the block. */
int id;
/* Virtual address and size of the block. */
void * va;
size_t size;
/* Physical address of the block. */
uint64_t phys_addr;
/* NOTE(review): presumably one bit per CPU allowed to use the block - confirm. */
uint64_t cpu_affinity_mask;
/* NOTE(review): presumably the thread id of the current owner - confirm. */
int owner_tid;
/* Non-zero when the block is currently acquired. */
int is_acquired;
} spine_tcm_block_info_t;
/*
 * Shared-library runtime ABI exported by libspine_tcm.so.
 * These are the symbols the inline spine_tcm_* wrappers forward to, either directly
 * (direct-link mode) or through function pointers resolved with dlsym (loader mode).
 */
SPINE_TCM_API const char * spine_tcm_runtime_version(void);
SPINE_TCM_API int spine_tcm_runtime_is_available(void);
SPINE_TCM_API int spine_tcm_runtime_layout_info(spine_tcm_mem_info_t * info);
SPINE_TCM_API int spine_tcm_runtime_mem_info(int id, spine_tcm_block_info_t * info);
SPINE_TCM_API void * spine_tcm_runtime_mem_get(int id);
SPINE_TCM_API int spine_tcm_runtime_mem_free(int id);
/* timeout_us: wait budget; the name indicates microseconds. */
SPINE_TCM_API void * spine_tcm_runtime_mem_try_wait(int id, size_t timeout_us);
SPINE_TCM_API int spine_tcm_runtime_mem_release(int id);
SPINE_TCM_API int spine_tcm_runtime_mem_force_release(int id);
SPINE_TCM_API int spine_tcm_runtime_mem_query(int id);
#if defined(SPINE_TCM_DIRECT_LINK)
/*
 * Direct-link mode: every wrapper below is a plain forwarder to the matching
 * spine_tcm_runtime_* symbol of the linked library.
 */
/* Optional no-op in direct-link mode: so_path is ignored and 0 is always returned. */
static inline int spine_tcm_open_handle(const char * so_path) {
(void) so_path;
return 0;
}
/* Returns the version string reported by the runtime. */
static inline const char * spine_tcm_version(void) {
return spine_tcm_runtime_version();
}
/* Returns 1 when the runtime driver is available, otherwise 0. */
static inline int spine_tcm_is_available(void) {
return spine_tcm_runtime_is_available();
}
/* Returns runtime memory geometry and whether the current backend is fake TCM. */
static inline int spine_tcm_mem_info(spine_tcm_mem_info_t * info) {
return spine_tcm_runtime_layout_info(info);
}
/* Returns per-block runtime metadata for the given TCM id. */
static inline int spine_tcm_block_info(int id, spine_tcm_block_info_t * info) {
return spine_tcm_runtime_mem_info(id, info);
}
/* Returns a cached buffer for the given TCM id, or NULL on failure. */
static inline void * spine_tcm_mem_get(int id) {
return spine_tcm_runtime_mem_get(id);
}
/* Releases one reference acquired by spine_tcm_mem_get(id). */
static inline int spine_tcm_mem_free(int id) {
return spine_tcm_runtime_mem_free(id);
}
/*
 * Waits for a TCM block handoff and returns the driver-owned buffer when available.
 * over_time is passed as the runtime's timeout_us argument.
 */
static inline void * spine_tcm_mem_try_wait(int id, size_t over_time) {
return spine_tcm_runtime_mem_try_wait(id, over_time);
}
/* Releases a buffer acquired by spine_tcm_mem_try_wait(id, over_time). */
static inline int spine_tcm_mem_release(int id) {
return spine_tcm_runtime_mem_release(id);
}
/* Forces a release for the given TCM id when the backend supports it. */
static inline int spine_tcm_mem_force_release(int id) {
return spine_tcm_runtime_mem_force_release(id);
}
/* Returns whether the given TCM id is currently acquired. */
static inline int spine_tcm_mem_query(int id) {
return spine_tcm_runtime_mem_query(id);
}
#elif !defined(SPINE_TCM_BUILD_SHARED)
/*
 * Loader-mode binding state: where the runtime symbols were resolved from, plus one
 * function pointer per spine_tcm_runtime_* entry point.
 */
typedef struct spine_tcm_handle {
/* Handle returned by dlopen, or NULL when the library was not loaded through this handle. */
void * module_handle;
/* Non-zero when symbols are resolved from RTLD_DEFAULT instead of module_handle. */
int use_global_scope;
/* Non-zero when module_handle came from our own dlopen call. */
int owns_module_handle;
/* Resolved entry points; each has the signature of the same-named spine_tcm_* prototype. */
const char * (*runtime_version)(void);
int (*runtime_is_available)(void);
int (*runtime_layout_info)(spine_tcm_mem_info_t * info);
int (*runtime_mem_info)(int id, spine_tcm_block_info_t * info);
void * (*runtime_mem_get)(int id);
int (*runtime_mem_free)(int id);
void * (*runtime_mem_try_wait)(int id, size_t over_time);
int (*runtime_mem_release)(int id);
int (*runtime_mem_force_release)(int id);
int (*runtime_mem_query)(int id);
} spine_tcm_handle_t;
/*
 * Returns the zero-initialised binding state used by the loader-mode wrappers.
 * The object is a function-local static of a static inline function.
 */
static inline spine_tcm_handle_t * spine_tcm_default_handle(void) {
    static spine_tcm_handle_t default_binding = { 0 };
    return &default_binding;
}
/* Zeroes every field of *handle; a NULL handle is ignored. Does not dlclose anything. */
static inline void spine_tcm_handle_reset(spine_tcm_handle_t * handle) {
    if (handle == NULL) {
        return;
    }
    memset(handle, 0, sizeof(*handle));
}
/*
 * Resolves every spine_tcm_runtime_* entry point with dlsym and stores it in *handle.
 * Symbols are looked up in RTLD_DEFAULT when use_global_scope is set, otherwise in
 * module_handle. Returns 0 when all ten symbols were found, -1 otherwise; on failure the
 * pointers that were found stay stored in *handle.
 */
static inline int spine_tcm_handle_bind(spine_tcm_handle_t * handle) {
    void * scope;
    if (handle->use_global_scope) {
        scope = RTLD_DEFAULT;
    } else {
        scope = handle->module_handle;
    }

    handle->runtime_version      = (const char * (*) (void) ) dlsym(scope, "spine_tcm_runtime_version");
    handle->runtime_is_available = (int (*)(void)) dlsym(scope, "spine_tcm_runtime_is_available");
    handle->runtime_layout_info  = (int (*)(spine_tcm_mem_info_t *)) dlsym(scope, "spine_tcm_runtime_layout_info");
    handle->runtime_mem_info = (int (*)(int, spine_tcm_block_info_t *)) dlsym(scope, "spine_tcm_runtime_mem_info");
    handle->runtime_mem_get  = (void * (*) (int) ) dlsym(scope, "spine_tcm_runtime_mem_get");
    handle->runtime_mem_free = (int (*)(int)) dlsym(scope, "spine_tcm_runtime_mem_free");
    handle->runtime_mem_try_wait      = (void * (*) (int, size_t)) dlsym(scope, "spine_tcm_runtime_mem_try_wait");
    handle->runtime_mem_release       = (int (*)(int)) dlsym(scope, "spine_tcm_runtime_mem_release");
    handle->runtime_mem_force_release = (int (*)(int)) dlsym(scope, "spine_tcm_runtime_mem_force_release");
    handle->runtime_mem_query         = (int (*)(int)) dlsym(scope, "spine_tcm_runtime_mem_query");

    const int all_bound = handle->runtime_version != NULL && handle->runtime_is_available != NULL &&
                          handle->runtime_layout_info != NULL && handle->runtime_mem_info != NULL &&
                          handle->runtime_mem_get != NULL && handle->runtime_mem_free != NULL &&
                          handle->runtime_mem_try_wait != NULL && handle->runtime_mem_release != NULL &&
                          handle->runtime_mem_force_release != NULL && handle->runtime_mem_query != NULL;
    if (!all_bound) {
        return -1;
    }
    return 0;
}
/*
 * Tries to bind against a spine_tcm instance that is already loaded in the process.
 * The probe looks up "spine_tcm_runtime_marker" in RTLD_DEFAULT; the shared library exports
 * that symbol only for this purpose. Returns -1 without touching *handle when the marker
 * is absent; otherwise sets use_global_scope and returns the result of the symbol bind.
 */
static inline int spine_tcm_try_bind_global(spine_tcm_handle_t * handle) {
    void * marker = dlsym(RTLD_DEFAULT, "spine_tcm_runtime_marker");
    if (marker != NULL) {
        handle->use_global_scope = 1;
        return spine_tcm_handle_bind(handle);
    }
    return -1;
}
/*
 * Optional pre-bind entry point for the default handle.
 *
 * Steps, in order:
 *   1. If the default handle is already bound (module loaded or global scope
 *      in use), return 0 immediately.
 *   2. Try to reuse a spine_tcm instance already visible in the process via
 *      spine_tcm_try_bind_global().
 *   3. Otherwise dlopen() so_path, or "libspine_tcm.so" when so_path is NULL
 *      or empty, with RTLD_LAZY | RTLD_GLOBAL, and bind its symbols.
 *
 * Returns 0 on success and -1 on failure. On failure the default handle is
 * reset to all zeros and a library opened in step 3 is closed again.
 */
static inline int spine_tcm_open_handle(const char * so_path) {
    spine_tcm_handle_t * tcm = spine_tcm_default_handle();
    const char * library = "libspine_tcm.so";

    if (so_path != NULL && so_path[0] != '\0') {
        library = so_path;
    }

    /* Bound by an earlier call: nothing left to do. */
    if (tcm->use_global_scope || tcm->module_handle != NULL) {
        return 0;
    }

    if (spine_tcm_try_bind_global(tcm) == 0) {
        return 0;
    }
    /* Discard whatever the failed global probe wrote into the handle. */
    spine_tcm_handle_reset(tcm);

    tcm->module_handle = dlopen(library, RTLD_LAZY | RTLD_GLOBAL);
    if (tcm->module_handle == NULL) {
        spine_tcm_handle_reset(tcm);
        return -1;
    }
    tcm->owns_module_handle = 1;

    if (spine_tcm_handle_bind(tcm) == 0) {
        return 0;
    }

    /* The library loaded but lacks required symbols: unload and start clean. */
    dlclose(tcm->module_handle);
    spine_tcm_handle_reset(tcm);
    return -1;
}
/*
 * Reports whether the runtime driver is available.
 *
 * Binds the default handle on first use via spine_tcm_open_handle(NULL).
 * Returns 0 when the runtime cannot be bound; otherwise forwards the result
 * of spine_tcm_runtime_is_available() (presumably 1 when available).
 */
static inline int spine_tcm_is_available(void) {
    spine_tcm_handle_t * tcm = spine_tcm_default_handle();

    if (!tcm->use_global_scope && tcm->module_handle == NULL) {
        (void) spine_tcm_open_handle(NULL);
        if (!tcm->use_global_scope && tcm->module_handle == NULL) {
            return 0;
        }
    }
    if (tcm->runtime_is_available == NULL) {
        return 0;
    }
    return tcm->runtime_is_available();
}
/*
 * Fills *info through the runtime's spine_tcm_runtime_layout_info() entry
 * point (described upstream as memory geometry plus a fake-TCM indicator; the
 * structure layout is not visible here).
 *
 * Binds the default handle on first use. Returns -1 when the runtime cannot
 * be bound, otherwise the runtime's own return value.
 */
static inline int spine_tcm_mem_info(spine_tcm_mem_info_t * info) {
    spine_tcm_handle_t * tcm = spine_tcm_default_handle();

    if (!tcm->use_global_scope && tcm->module_handle == NULL) {
        (void) spine_tcm_open_handle(NULL);
        if (!tcm->use_global_scope && tcm->module_handle == NULL) {
            return -1;
        }
    }
    if (tcm->runtime_layout_info == NULL) {
        return -1;
    }
    return tcm->runtime_layout_info(info);
}
/*
 * Returns the version string reported by spine_tcm_runtime_version().
 *
 * Binds the default handle on first use. When the runtime cannot be bound the
 * literal "unknown" is returned instead.
 */
static inline const char * spine_tcm_version(void) {
    spine_tcm_handle_t * tcm = spine_tcm_default_handle();

    if (!tcm->use_global_scope && tcm->module_handle == NULL) {
        (void) spine_tcm_open_handle(NULL);
        if (!tcm->use_global_scope && tcm->module_handle == NULL) {
            return "unknown";
        }
    }
    if (tcm->runtime_version == NULL) {
        return "unknown";
    }
    return tcm->runtime_version();
}
/*
 * Fills *info with the runtime's metadata for TCM block `id` by calling
 * spine_tcm_runtime_mem_info(id, info).
 *
 * Binds the default handle on first use. Returns -1 when the runtime cannot
 * be bound, otherwise the runtime's own return value.
 */
static inline int spine_tcm_block_info(int id, spine_tcm_block_info_t * info) {
    spine_tcm_handle_t * tcm = spine_tcm_default_handle();

    if (!tcm->use_global_scope && tcm->module_handle == NULL) {
        (void) spine_tcm_open_handle(NULL);
        if (!tcm->use_global_scope && tcm->module_handle == NULL) {
            return -1;
        }
    }
    if (tcm->runtime_mem_info == NULL) {
        return -1;
    }
    return tcm->runtime_mem_info(id, info);
}
/*
 * Obtains the buffer for TCM block `id` from spine_tcm_runtime_mem_get(id).
 *
 * Binds the default handle on first use. Returns NULL when the runtime cannot
 * be bound; otherwise returns whatever the runtime returns (NULL on failure,
 * per the upstream description).
 */
static inline void * spine_tcm_mem_get(int id) {
    spine_tcm_handle_t * tcm = spine_tcm_default_handle();
    int unbound = !tcm->use_global_scope && tcm->module_handle == NULL;

    if (unbound) {
        (void) spine_tcm_open_handle(NULL);
        /* Re-read the state: the open attempt may have bound the handle. */
        unbound = !tcm->use_global_scope && tcm->module_handle == NULL;
    }
    if (unbound || tcm->runtime_mem_get == NULL) {
        return NULL;
    }
    return tcm->runtime_mem_get(id);
}
/*
 * Counterpart of spine_tcm_mem_get(id): forwards to
 * spine_tcm_runtime_mem_free(id).
 *
 * Binds the default handle on first use. Returns -1 when the runtime cannot
 * be bound, otherwise the runtime's own return value.
 */
static inline int spine_tcm_mem_free(int id) {
    spine_tcm_handle_t * tcm = spine_tcm_default_handle();

    if (!tcm->use_global_scope && tcm->module_handle == NULL) {
        (void) spine_tcm_open_handle(NULL);
        if (!tcm->use_global_scope && tcm->module_handle == NULL) {
            return -1;
        }
    }
    if (tcm->runtime_mem_free == NULL) {
        return -1;
    }
    return tcm->runtime_mem_free(id);
}
/*
 * Forwards to spine_tcm_runtime_mem_try_wait(id, over_time), which upstream
 * describes as waiting for a TCM block handoff and returning the driver-owned
 * buffer. `over_time` is presumably a timeout; its unit is not visible here.
 *
 * Binds the default handle on first use. Returns NULL when the runtime cannot
 * be bound, otherwise the runtime's own return value.
 */
static inline void * spine_tcm_mem_try_wait(int id, size_t over_time) {
    spine_tcm_handle_t * tcm = spine_tcm_default_handle();
    int unbound = !tcm->use_global_scope && tcm->module_handle == NULL;

    if (unbound) {
        (void) spine_tcm_open_handle(NULL);
        /* Re-read the state: the open attempt may have bound the handle. */
        unbound = !tcm->use_global_scope && tcm->module_handle == NULL;
    }
    if (unbound || tcm->runtime_mem_try_wait == NULL) {
        return NULL;
    }
    return tcm->runtime_mem_try_wait(id, over_time);
}
/*
 * Counterpart of spine_tcm_mem_try_wait(id, over_time): forwards to
 * spine_tcm_runtime_mem_release(id).
 *
 * Binds the default handle on first use. Returns -1 when the runtime cannot
 * be bound, otherwise the runtime's own return value.
 */
static inline int spine_tcm_mem_release(int id) {
    spine_tcm_handle_t * tcm = spine_tcm_default_handle();

    if (!tcm->use_global_scope && tcm->module_handle == NULL) {
        (void) spine_tcm_open_handle(NULL);
        if (!tcm->use_global_scope && tcm->module_handle == NULL) {
            return -1;
        }
    }
    if (tcm->runtime_mem_release == NULL) {
        return -1;
    }
    return tcm->runtime_mem_release(id);
}
/*
 * Forwards to spine_tcm_runtime_mem_force_release(id), which forces a release
 * of TCM block `id` where the backend supports it.
 *
 * Binds the default handle on first use. Returns -1 when the runtime cannot
 * be bound, otherwise the runtime's own return value.
 */
static inline int spine_tcm_mem_force_release(int id) {
    spine_tcm_handle_t * tcm = spine_tcm_default_handle();

    if (!tcm->use_global_scope && tcm->module_handle == NULL) {
        (void) spine_tcm_open_handle(NULL);
        if (!tcm->use_global_scope && tcm->module_handle == NULL) {
            return -1;
        }
    }
    if (tcm->runtime_mem_force_release == NULL) {
        return -1;
    }
    return tcm->runtime_mem_force_release(id);
}
/*
 * Asks the runtime, via spine_tcm_runtime_mem_query(id), whether TCM block
 * `id` is currently acquired.
 *
 * Binds the default handle on first use. Returns -1 when the runtime cannot
 * be bound, otherwise the runtime's own return value.
 */
static inline int spine_tcm_mem_query(int id) {
    spine_tcm_handle_t * tcm = spine_tcm_default_handle();

    if (!tcm->use_global_scope && tcm->module_handle == NULL) {
        (void) spine_tcm_open_handle(NULL);
        if (!tcm->use_global_scope && tcm->module_handle == NULL) {
            return -1;
        }
    }
    if (tcm->runtime_mem_query == NULL) {
        return -1;
    }
    return tcm->runtime_mem_query(id);
}
#else
/*
 * Direct-call variant of spine_tcm_version(): returns the string produced by
 * spine_tcm_runtime_version() without any dlsym() lookup.
 */
static inline const char * spine_tcm_version(void) {
    const char * runtime_version = spine_tcm_runtime_version();
    return runtime_version;
}
#endif
/*
 * Convenience spelling for the runtime version string. Expands to a call to
 * spine_tcm_version(), so it is evaluated at each use and is not a
 * compile-time constant.
 */
#define SPINE_TCM_VERSION (spine_tcm_version())
#ifdef __cplusplus
}
#endif
#endif