ggml-cpu: Add IME2 Instruction Support for the SpacemiT Backend (#22863)

2026-06-09 07:16:44 +02:00 · 2026-05-14 17:39:30 +08:00
parent 0f45f1a35c
commit 81b0d882ae
21 changed files with 14732 additions and 3477 deletions
@@ -301,16 +301,17 @@ jobs:
          export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
          cmake -B build -DLLAMA_OPENSSL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DGGML_CPU_REPACK=OFF \
                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DGGML_CPU_RISCV64_SPACEMIT=ON \
                         -DGGML_RVV=ON \
+                         -DGGML_RV_ZVFH=ON \
                         -DGGML_RV_ZFH=ON \
                         -DGGML_RV_ZICBOP=ON \
                         -DGGML_RV_ZIHINTPAUSE=ON \
-                         -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
+                         -DGGML_RV_ZBA=ON \
                         -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake

          cmake --build build --config Release -j $(nproc)
@@ -24,6 +24,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
-set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
-set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
+set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_CXX_FLAGS}")
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
@@ -9,18 +9,20 @@ wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_6
 ~~~

 2. Build
-Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
+Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1` and `RISCV64_SPACEMIT_IME2`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
 ```bash

 cmake -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_CPU_RISCV64_SPACEMIT=ON \
+    -DGGML_CPU_REPACK=OFF \
    -DLLAMA_OPENSSL=OFF \
    -DGGML_RVV=ON \
+    -DGGML_RV_ZVFH=ON \
    -DGGML_RV_ZFH=ON \
    -DGGML_RV_ZICBOP=ON \
    -DGGML_RV_ZIHINTPAUSE=ON \
-    -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
+    -DGGML_RV_ZBA=ON \
    -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
    -DCMAKE_INSTALL_PREFIX=build/installed

@@ -47,8 +49,25 @@ export RISCV_ROOT_PATH_IME1={your RISC-V compiler path}

 ${QEMU_ROOT_PATH}/bin/qemu-riscv64 -L ${RISCV_ROOT_PATH_IME1}/sysroot -cpu max,vlen=256,elen=64,vext_spec=v1.0 ${PWD}/build/bin/llama-cli -m ${PWD}/models/Qwen2.5-0.5B-Instruct-Q4_0.gguf -t 1
 ~~~
+
+## Quantization Support For Matrix
+
+| Quantization Type | X60 | A100 |
+| ---: | ---: | ---: |
+| Q2_K |  | :heavy_check_mark: |
+| Q3_K |  | :heavy_check_mark: |
+| Q4_0 | :heavy_check_mark: | :heavy_check_mark: |
+| Q4_1 | :heavy_check_mark: | :heavy_check_mark: |
+| Q4_K | :heavy_check_mark: | :heavy_check_mark: |
+| Q5_0 |  | :heavy_check_mark: |
+| Q5_1 |  | :heavy_check_mark: |
+| Q5_K |  | :heavy_check_mark: |
+| Q6_K |  | :heavy_check_mark: |
+| Q8_0 |  | :heavy_check_mark: |
+
+
 ## Performance
-#### Quantization Support For Matrix
+* Spacemit(R) X60
 ~~~
 model name      : Spacemit(R) X60
 isa             : rv64imafdcv_zicbom_zicboz_zicntr_zicond_zicsr_zifencei_zihintpause_zihpm_zfh_zfhmin_zca_zcd_zba_zbb_zbc_zbs_zkt_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkt_sscofpmf_sstc_svinval_svnapot_svpbmt
@@ -58,33 +77,34 @@ mvendorid       : 0x710
 marchid         : 0x8000000058000001
 ~~~

-Q4_0
-|   Model    |   Size   | Params | backend | threads | test | t/s |
-| -----------| -------- | ------ | ------- | ------- | ---- |------|
-Qwen2.5 0.5B |403.20 MiB|630.17 M|   cpu   |    4    | pp512|64.12 ± 0.26|
-Qwen2.5 0.5B |403.20 MiB|630.17 M|   cpu   |    4    | tg128|10.03 ± 0.01|
-Qwen2.5 1.5B |1011.16 MiB| 1.78 B |   cpu   |    4    | pp512|24.16 ± 0.02|
-Qwen2.5 1.5B |1011.16 MiB| 1.78 B |   cpu   |    4    | tg128|3.83 ± 0.06|
-Qwen2.5 3B   | 1.86 GiB  | 3.40 B |   cpu   |    4    | pp512|12.08 ± 0.02|
-Qwen2.5 3B   | 1.86 GiB  | 3.40 B |   cpu   |    4    | tg128|2.23 ± 0.02|
-
-Q4_1
-|   Model    |   Size   | Params | backend | threads | test | t/s |
-| -----------| -------- | ------ | ------- | ------- | ---- |------|
-Qwen2.5 0.5B |351.50 MiB|494.03 M|   cpu   |    4    | pp512|62.07 ± 0.12|
-Qwen2.5 0.5B |351.50 MiB|494.03 M|   cpu   |    4    | tg128|9.91 ± 0.01|
-Qwen2.5 1.5B |964.06 MiB| 1.54 B |   cpu   |    4    | pp512|22.95 ± 0.25|
-Qwen2.5 1.5B |964.06 MiB| 1.54 B |   cpu   |    4    | tg128|4.01 ± 0.15|
-Qwen2.5 3B   | 1.85 GiB | 3.09 B |   cpu   |    4    | pp512|11.55 ± 0.16|
-Qwen2.5 3B   | 1.85 GiB | 3.09 B |   cpu   |    4    | tg128|2.25 ± 0.04|
+| model                          |       size |     params | backend    | threads | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | ---: | --------------: | -------------------: |
+| qwen35 2B Q4_1                 |   1.19 GiB |     1.88 B | CPU        |       4 |      128 |  1 |    0 |           pp128 |         10.32 ± 0.02 |
+| qwen35 2B Q4_1                 |   1.19 GiB |     1.88 B | CPU        |       4 |      128 |  1 |    0 |           tg128 |          3.07 ± 0.01 |
+| qwen3 0.6B Q4_0                | 358.78 MiB |   596.05 M | CPU        |       4 |      128 |  1 |    0 |           pp128 |         49.15 ± 0.25 |
+| qwen3 0.6B Q4_0                | 358.78 MiB |   596.05 M | CPU        |       4 |      128 |  1 |    0 |           tg128 |         11.73 ± 0.02 |


-Q4_K
-|   Model    |   Size   | Params | backend | threads | test | t/s |
-| -----------| -------- | ------ | ------- | ------- | ---- |------|
-Qwen2.5 0.5B |462.96 MiB|630.17 M|   cpu   |    4    | pp512|9.29 ± 0.05|
-Qwen2.5 0.5B |462.96 MiB|630.17 M|   cpu   |    4    | tg128|5.67 ± 0.04|
-Qwen2.5 1.5B | 1.04 GiB | 1.78 B |   cpu   |    4    | pp512|10.38 ± 0.10|
-Qwen2.5 1.5B | 1.04 GiB | 1.78 B |   cpu   |    4    | tg128|3.17 ± 0.08|
-Qwen2.5 3B   | 1.95 GiB | 3.40 B |   cpu   |    4    | pp512|4.23 ± 0.04|
-Qwen2.5 3B   | 1.95 GiB | 3.40 B |   cpu   |    4    | tg128|1.73 ± 0.00|
+* Spacemit(R) A100
+~~~
+model name      : Spacemit(R) A100
+isa             : rv64imafdcvh_zicbom_zicbop_zicboz_zicntr_zicond_zicsr_zifencei_zihintntl_zihintpause_zihpm_zimop_zaamo_zalrsc_zawrs_zfa_zfh_zfhmin_zca_zcb_zcd_zcmop_zba_zbb_zbc_zbs_zkt_zvbb_zvbc_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvkt_smaia_smstateen_ssaia_sscofpmf_sstc_svinval_svnapot_svpbmt_sdtrig
+mmu             : sv39
+mvendorid       : 0x710
+marchid         : 0x8000000041000002
+mimpid          : 0x10000000d5686200
+hart isa        : rv64imafdcv_zicbom_zicbop_zicboz_zicntr_zicond_zicsr_zifencei_zihintntl_zihintpause_zihpm_zimop_zaamo_zalrsc_zawrs_zfa_zfh_zfhmin_zca_zcb_zcd_zcmop_zba_zbb_zbc_zbs_zkt_zvbb_zvbc_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkb_zvkg_zvkned_zvknha_zvknhb_zvksed_zvksh_zvkt_smaia_smstateen_ssaia_sscofpmf_sstc_svinval_svnapot_svpbmt_sdtrig
+~~~
+
+| model                          |       size |     params | backend    | threads | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | ---: | --------------: | -------------------: |
+| qwen3 0.6B Q4_0                | 358.78 MiB |   596.05 M | CPU        |       8 |      128 |  1 |    0 |           pp128 |        565.83 ± 0.31 |
+| qwen3 0.6B Q4_0                | 358.78 MiB |   596.05 M | CPU        |       8 |      128 |  1 |    0 |           tg128 |         55.77 ± 0.02 |
+| qwen3 4B Q4_0                  |   2.21 GiB |     4.02 B | CPU        |       8 |      128 |  1 |    0 |           pp128 |         79.74 ± 0.04 |
+| qwen3 4B Q4_0                  |   2.21 GiB |     4.02 B | CPU        |       8 |      128 |  1 |    0 |           tg128 |         11.29 ± 0.00 |
+| qwen3moe 30B.A3B Q4_0          |  16.18 GiB |    30.53 B | CPU        |       8 |      128 |  1 |    0 |           pp128 |         57.88 ± 0.31 |
+| qwen3moe 30B.A3B Q4_0          |  16.18 GiB |    30.53 B | CPU        |       8 |      128 |  1 |    0 |           tg128 |         12.79 ± 0.00 |
+| qwen35 2B Q4_1                 |   1.19 GiB |     1.88 B | CPU        |       8 |      128 |  1 |    0 |           pp128 |        115.23 ± 0.04 |
+| qwen35 2B Q4_1                 |   1.19 GiB |     1.88 B | CPU        |       8 |      128 |  1 |    0 |           tg128 |         16.49 ± 0.01 |
+| gemma4 E4B Q4_K - Medium       |   4.76 GiB |     7.52 B | CPU        |       8 |      128 |  1 |    0 |           pp128 |         21.13 ± 0.01 |
+| gemma4 E4B Q4_K - Medium       |   4.76 GiB |     7.52 B | CPU        |       8 |      128 |  1 |    0 |           tg128 |          5.66 ± 0.00 |
@@ -450,12 +450,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            ggml-cpu/arch/riscv/repack.cpp
            )
        if (GGML_CPU_RISCV64_SPACEMIT)
+            include(ggml-cpu/cmake/FindSMTIME.cmake)
            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
            list(APPEND GGML_CPU_SOURCES
                ggml-cpu/spacemit/ime.cpp
                ggml-cpu/spacemit/ime.h
+                ggml-cpu/spacemit/spine_mem_pool.cpp
+                ggml-cpu/spacemit/spine_mem_pool.h
+                ggml-cpu/spacemit/repack.cpp
+                ggml-cpu/spacemit/repack.h
+                ggml-cpu/spacemit/ime_env.cpp
+                ggml-cpu/spacemit/ime_env.h
                ggml-cpu/spacemit/ime1_kernels.cpp
+                ggml-cpu/spacemit/ime2_kernels.cpp
                ggml-cpu/spacemit/ime_kernels.h
+                ggml-cpu/spacemit/rvv_kernels.cpp
+                ggml-cpu/spacemit/rvv_kernels.h
            )
        endif()
        if(NOT GGML_CPU_ALL_VARIANTS)
@@ -485,6 +495,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            if (GGML_RV_ZIHINTPAUSE)
                string(APPEND MARCH_STR "_zihintpause")
            endif()
+            if (GGML_RV_ZBA)
+                string(APPEND MARCH_STR "_zba")
+            endif()
            if (GGML_CPU_RISCV64_SPACEMIT)
                # `xsmtvdotii' is only required for GCC >= 15.
                if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
@@ -0,0 +1,32 @@
+include(CheckCSourceRuns)
+
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)" AND GGML_CPU_RISCV64_SPACEMIT)
+    set(SMT_MARCH_STR "-march=rv64gcv_zfh_zvfh_zba_zicbop")
+    if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
+        CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 15)
+        string(APPEND SMT_MARCH_STR "_xsmtvdotii")
+    endif()
+    set(CMAKE_REQUIRED_FLAGS "${SMT_MARCH_STR}")
+
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_IME1)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1, i4\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S4)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot v2, v0, v1, i8\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S8)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vfwmadot v2, v0, v1, fp16\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFWMADOT_FP16)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot.hp v2, v0, v1, v0, 0, i4\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFMADOT_S4)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot.hp v2, v0, v1, v0, 0, i8\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VFMADOT_S8)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vmadot1 v2, v0, v1\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOTN)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vpack.vv v2, v0, v1, 2\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VPACK)
+    check_c_source_compiles("int main() {__asm__ volatile(\"vnspack.vv v2, v0, v1, 2\");}" SPACEMIT_RISCV_COMPILER_SUPPORT_VNPACK)
+    unset(CMAKE_REQUIRED_FLAGS)
+
+    list(APPEND RISCV64_SPACEMIT_IME_SPEC "")
+    if (SPACEMIT_RISCV_COMPILER_SUPPORT_IME1)
+        set(RISCV64_SPACEMIT_IME_SPEC "RISCV64_SPACEMIT_IME1")
+    endif()
+
+    if (SPACEMIT_RISCV_COMPILER_SUPPORT_VMADOT_S4 AND SPACEMIT_RISCV_COMPILER_SUPPORT_VPACK AND SPACEMIT_RISCV_COMPILER_SUPPORT_VNPACK)
+        list(APPEND RISCV64_SPACEMIT_IME_SPEC "RISCV64_SPACEMIT_IME2")
+    endif()
+
+    message("RISCV64_SPACEMIT_IME_SPEC: ${RISCV64_SPACEMIT_IME_SPEC}")
+endif()
@@ -50,6 +50,10 @@
 #include "llamafile/sgemm.h"
 #endif

+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+#    include "spacemit/ime.h"
+#endif
+
 // Note: once we move threading into a separate C++ file
 // will use std::hardware_destructive_interference_size instead of hardcoding it here
 // and we'll use C++ attribute syntax.
@@ -3011,7 +3015,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
    const struct ggml_cgraph * cgraph = tp->cgraph;
    const struct ggml_cplan  * cplan  = tp->cplan;

+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+    ggml_backend_cpu_riscv64_spacemit_set_numa_thread_affinity(state->ith);
+#else
    set_numa_thread_affinity(state->ith);
+#endif

    struct ggml_compute_params params = {
        /*.ith        =*/ state->ith,
@@ -3068,6 +3076,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

    ggml_barrier(state->threadpool);

+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+    ggml_backend_cpu_riscv64_spacemit_clear_numa_thread_affinity_threaded(state->ith);
+#endif
+
    return 0;
 }

@@ -8,6 +8,14 @@ extern "C" {

 ggml_backend_buffer_type_t ggml_backend_cpu_riscv64_spacemit_buffer_type(void);

+void ggml_backend_cpu_riscv64_spacemit_set_numa_thread_affinity(int thread_n);
+
+void ggml_backend_cpu_riscv64_spacemit_clear_numa_thread_affinity_threaded(int thread_n);
+
+void * ggml_backend_cpu_riscv64_spacemit_alloc_shared(size_t size, size_t alignment);
+
+void ggml_backend_cpu_riscv64_spacemit_free_shared(void * ptr);
+
 #ifdef __cplusplus
 }
 #endif
@@ -0,0 +1,320 @@
+#include "ime_env.h"
+
+#include "ggml-impl.h"
+#include "spine_mem_pool.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <array>
+#include <cctype>
+#include <fstream>
+#include <string>
+#include <thread>
+#include <unordered_map>
+
+namespace ggml::cpu::riscv64_spacemit {
+bool spine_core_info::get_spine_core_info(std::vector<spine_core_info> & result) {
+    static std::unordered_map<uint64_t, spine_core_arch_id> spine_march_mapping_ = {
+        {0x8000000058000001,  spine_core_arch_id::core_arch_x60 },
+        { 0x8000000041000001, spine_core_arch_id::core_arch_a60 },
+        { 0x8000000058000002, spine_core_arch_id::core_arch_x100},
+        { 0x8000000041000002, spine_core_arch_id::core_arch_a100},
+    };
+
+    result.clear();
+    std::ifstream file("/proc/cpuinfo");
+    std::string   line;
+
+    std::vector<std::array<uint64_t, 2>> cpu_info_list;
+
+    uint64_t current_processor = spine_invalid_core_id;
+    uint64_t current_marchid   = 0;
+    bool     has_processor     = false;
+    bool     has_marchid       = false;
+
+    if (!file.is_open()) {
+        return false;
+    }
+
+    while (std::getline(file, line)) {
+        if (line.substr(0, 9) == "processor") {
+            if (has_processor && has_marchid) {
+                cpu_info_list.push_back({ current_processor, current_marchid });
+            }
+
+            size_t colon_pos = line.find(':');
+            if (colon_pos != std::string::npos) {
+                current_processor = std::stoi(line.substr(colon_pos + 1));
+                has_processor     = true;
+            }
+
+            has_marchid = false;
+        } else if (line.substr(0, 7) == "marchid") {
+            size_t colon_pos = line.find(':');
+            if (colon_pos != std::string::npos) {
+                std::string marchid_str = line.substr(colon_pos + 1);
+                marchid_str.erase(std::remove_if(marchid_str.begin(), marchid_str.end(), isspace), marchid_str.end());
+                current_marchid = std::stoull(marchid_str, nullptr, 16);
+                has_marchid     = true;
+            }
+        }
+    }
+
+    if (has_processor && has_marchid) {
+        cpu_info_list.push_back({ current_processor, current_marchid });
+    }
+
+    if (has_processor && has_marchid) {
+        for (auto & cpu_info : cpu_info_list) {
+            if (cpu_info[0] != spine_invalid_core_id &&
+                spine_march_mapping_.find(cpu_info[1]) != spine_march_mapping_.end()) {
+                auto core_info    = spine_core_info();
+                core_info.core_id = cpu_info[0];
+                core_info.arch_id = spine_core_arch_id(spine_march_mapping_[cpu_info[1]]);
+
+                result.push_back(core_info);
+            }
+        }
+    }
+
+    return has_processor && has_marchid;
+}
+
+namespace {
+uint16_t hex_string_to_u16(const std::string & hex_str) {
+    try {
+        size_t pos = 0;
+        if (hex_str.substr(0, 2) == "0x" || hex_str.substr(0, 2) == "0X") {
+            pos = 2;
+        }
+        unsigned long result = std::stoul(hex_str.substr(pos), nullptr, 16);
+        if (result > std::numeric_limits<uint16_t>::max()) {
+            throw std::out_of_range("Converted value is out of range for uint16_t");
+        }
+        return static_cast<uint16_t>(result);
+    } catch (const std::invalid_argument & e) {
+        throw std::invalid_argument("Invalid hexadecimal string");
+    } catch (const std::out_of_range & e) {
+        throw;
+    }
+}
+
+const char * spine_mem_pool_backend_to_string(spine_mem_pool_backend backend) {
+    switch (backend) {
+        case spine_mem_pool_backend::none:
+            return "NONE";
+        case spine_mem_pool_backend::posix_memalign:
+            return "POSIX";
+        case spine_mem_pool_backend::transparent_hugepage:
+            return "HPAGE";
+        case spine_mem_pool_backend::hugetlb_1g:
+            return "HPAGE1GB";
+    }
+
+    return "unknown";
+}
+
+spine_mem_pool_backend parse_mem_backend(const char * mem_backend_str) {
+    if (mem_backend_str == nullptr || mem_backend_str[0] == '\0') {
+        return spine_mem_pool_backend::transparent_hugepage;
+    }
+
+    std::string value(mem_backend_str);
+    std::transform(value.begin(), value.end(), value.begin(),
+                   [](unsigned char ch) { return static_cast<char>(std::tolower(ch)); });
+
+    if (value == "none") {
+        return spine_mem_pool_backend::none;
+    }
+
+    if (value == "posix") {
+        return spine_mem_pool_backend::posix_memalign;
+    }
+
+    if (value == "hpage") {
+        return spine_mem_pool_backend::transparent_hugepage;
+    }
+
+    if (value == "hpage1gb") {
+        return spine_mem_pool_backend::hugetlb_1g;
+    }
+
+    throw std::runtime_error("invalid SPACEMIT_MEM_BACKEND: " + value + ", expected NONE, POSIX, HPAGE or HPAGE1GB");
+}
+}  // namespace
+
+spine_env_info::spine_env_info() {
+    num_cores = static_cast<int>(std::thread::hardware_concurrency());
+    spine_core_info::get_spine_core_info(core_info_list);
+
+    // special for x60 K1
+    if (core_info_list.size() == 8 && core_info_list[0].arch_id == spine_core_arch_id::core_arch_x60) {
+        for (int i = 0; i < 4; i++) {
+            core_info_list[i].arch_id = spine_core_arch_id::core_arch_a60;
+        }
+    }
+
+    // special for qemu
+    if (core_info_list.size() == 0) {
+        char * spine_core_arch_str = getenv("SPACEMIT_CORE_ARCH");
+        if (spine_core_arch_str != nullptr) {
+            auto arch_id = hex_string_to_u16(spine_core_arch_str);
+            for (int i = 0; i < num_cores; i++) {
+                auto core_info    = spine_core_info();
+                core_info.core_id = i;
+                core_info.arch_id = spine_core_arch_id{ arch_id };
+                core_info_list.push_back(core_info);
+            }
+        }
+    }
+
+    if (core_info_list.size() == 0) {
+        throw std::runtime_error(
+            "Failed to get SPACEMIT_CORE_ARCH from environment or failed to parse it from /proc/cpuinfo");
+    }
+
+    char * spine_perfer_core_arch_str = getenv("SPACEMIT_PERFER_CORE_ARCH");
+    if (spine_perfer_core_arch_str != nullptr && spine_perfer_core_arch_str != "") {
+        perfer_core_arch_id = spine_core_arch_id{ hex_string_to_u16(spine_perfer_core_arch_str) };
+    }
+
+    char *           spine_perfer_core_id_str = getenv("SPACEMIT_PERFER_CORE_ID");
+    std::vector<int> perfer_core_id_vec;
+    if (spine_perfer_core_id_str != nullptr && spine_perfer_core_id_str != "") {
+        std::string perfer_core_id_str(spine_perfer_core_id_str);
+        size_t      start = 0;
+        size_t      end   = 0;
+        while ((end = perfer_core_id_str.find(',', start)) != std::string::npos) {
+            std::string core_id_substr = perfer_core_id_str.substr(start, end - start);
+            perfer_core_id_vec.push_back(std::stoi(core_id_substr));
+            start = end + 1;
+        }
+        std::string core_id_substr = perfer_core_id_str.substr(start);
+        perfer_core_id_vec.push_back(std::stoi(core_id_substr));
+    }
+
+    perfer_core_ids.reserve(num_cores);
+    if (perfer_core_arch_id == spine_core_arch_id::core_arch_none) {
+        for (auto & core_info : core_info_list) {
+            auto core_arch_id   = core_info.arch_id;
+            auto core_arch_head = (uint16_t) (core_arch_id) >> 12;
+            if (core_arch_head == 0xA) {
+                num_perfer_cores++;
+                perfer_core_arch_id = core_arch_id;
+                cpu_mask |= (1ULL << core_info.core_id);
+                perfer_core_ids.push_back(core_info.core_id);
+            }
+        }
+    } else {
+        for (auto & core_info : core_info_list) {
+            auto core_arch_id = core_info.arch_id;
+            if (core_arch_id == perfer_core_arch_id) {
+                num_perfer_cores++;
+                cpu_mask |= (1ULL << core_info.core_id);
+
+                auto core_arch_head = (uint16_t) (core_arch_id) >> 12;
+                if (core_arch_head == 0xA) {
+                    perfer_core_ids.push_back(core_info.core_id);
+                }
+            }
+        }
+        if (num_perfer_cores == 0) {
+            GGML_ABORT("can not find core with arch id %x for SPACEMIT_PERFER_CORE_ARCH in core info list\n",
+                       (uint16_t) perfer_core_arch_id);
+        }
+    }
+
+    if (perfer_core_id_vec.size() > 0) {
+        perfer_core_ids.clear();
+        cpu_mask         = 0;
+        num_perfer_cores = 0;
+        for (int core_id : perfer_core_id_vec) {
+            if (core_id < 0 || core_id >= num_cores) {
+                GGML_ABORT("invalid core id in SPACEMIT_PERFER_CORE_ID: %d, should be between 0 and %d\n", core_id,
+                           num_cores - 1);
+            }
+            auto core_info    = core_info_list[core_id];
+            auto core_arch_id = core_info.arch_id;
+            if (core_arch_id == perfer_core_arch_id) {
+                cpu_mask |= (1ULL << core_id);
+                perfer_core_ids.push_back(core_id);
+            } else {
+                GGML_ABORT(
+                    "core id %d in SPACEMIT_PERFER_CORE_ID has arch id %x which does not match "
+                    "SPACEMIT_PERFER_CORE_ARCH %x\n",
+                    core_id, (uint16_t) core_arch_id, (uint16_t) perfer_core_arch_id);
+            }
+        }
+        std::string perfer_core_id_vec_str;
+        for (int core_id : perfer_core_id_vec) {
+            perfer_core_id_vec_str += std::to_string(core_id) + ",";
+        }
+        perfer_core_id_vec_str.pop_back();
+        GGML_LOG_DEBUG("SPACEMIT_PERFER_CORE_ID is set, perferred core ids: %s\n", perfer_core_id_vec_str.c_str());
+        num_perfer_cores = static_cast<int>(perfer_core_id_vec.size());
+    }
+
+    use_ime1 = perfer_core_arch_id == spine_core_arch_id::core_arch_a60 ||
+               perfer_core_arch_id == spine_core_arch_id::core_arch_x100;
+
+    use_ime2 = perfer_core_arch_id == spine_core_arch_id::core_arch_a100;
+
+    mem_backend                  = parse_mem_backend(getenv("SPACEMIT_MEM_BACKEND"));
+    char * spine_disable_tcm_str = getenv("SPACEMIT_DISABLE_TCM");
+    auto   user_disable_tcm      = spine_disable_tcm_str != nullptr && strcmp(spine_disable_tcm_str, "0") != 0;
+
+    if (!user_disable_tcm) {
+        spine_mem_pool_tcm_info tcm_info;
+        if (spine_mem_pool_tcm_init(&tcm_info)) {
+            use_tcm      = tcm_info.available;
+            tcm_blk_size = tcm_info.blk_size;
+            GGML_LOG_DEBUG("CPU_RISCV64_SPACEMIT: tcm is available, blk_size: %zu, blk_num: %zu, is_fake_tcm: %d\n",
+                           tcm_info.blk_size, tcm_info.blk_num, tcm_info.is_fake_tcm);
+
+            for (auto & core_info : core_info_list) {
+                auto core_arch_head = (uint16_t) (core_info.arch_id) >> 12;
+                if (core_arch_head != 0xA) {
+                    aicpu_id_offset++;
+                } else {
+                    break;
+                }
+            }
+        }
+    }
+
+    GGML_LOG_DEBUG(
+        "CPU_RISCV64_SPACEMIT: num_cores: %d, num_perfer_cores: %d, perfer_core_arch_id: %x, exclude_main_thread: %d, "
+        "use_ime1: %d, use_ime2: %d, mem_backend: %s, cpu_mask: %lx, aicpu_id_offset: %d\n",
+        num_cores, num_perfer_cores, (uint16_t) perfer_core_arch_id, exclude_main_thread, use_ime1, use_ime2,
+        spine_mem_pool_backend_to_string(mem_backend), cpu_mask, aicpu_id_offset);
+
+    const size_t init_barrier_size = sizeof(spine_barrier_t) * spine_init_barrier_count;
+    init_barrier =
+        static_cast<spine_barrier_t *>(spine_mem_pool_shared_mem_alloc(init_barrier_size, alignof(spine_barrier_t)));
+    if (init_barrier != nullptr) {
+        init_barrier_is_shared_mem = true;
+    } else {
+        GGML_LOG_WARN("CPU_RISCV64_SPACEMIT: failed to allocate init_barrier from shared mem, falling back to heap\n",
+                      __func__);
+        init_barrier = new spine_barrier_t[spine_init_barrier_count];
+    }
+
+    spine_barrier_init(init_barrier, spine_init_barrier_count, 2);
+}
+
+spine_env_info::~spine_env_info() {
+    if (init_barrier_is_shared_mem) {
+        spine_mem_pool_shared_mem_free(init_barrier);
+    } else {
+        delete[] init_barrier;
+    }
+
+    init_barrier               = nullptr;
+    init_barrier_is_shared_mem = false;
+}
+
+spine_env_info global_spine_env_info;
+
+}  // namespace ggml::cpu::riscv64_spacemit
@@ -0,0 +1,55 @@
+#pragma once
+
+#include "spine_barrier.h"
+#include "spine_mem_pool.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace ggml::cpu::riscv64_spacemit {
+
+constexpr uint64_t spine_invalid_core_id    = 0xFFFFFFFF;
+constexpr size_t   spine_init_barrier_count = 16;
+
+enum class spine_core_arch_id : uint16_t {
+    core_arch_none = 0,
+    core_arch_x60  = 0x503C,
+    core_arch_x100 = 0x5064,
+    core_arch_x200 = 0x50C8,
+    core_arch_a60  = 0xA03C,
+    core_arch_a100 = 0xA064,
+    core_arch_a200 = 0xA0C8,
+};
+
+struct spine_core_info {
+    uint64_t           core_id{ spine_invalid_core_id };
+    spine_core_arch_id arch_id{ spine_core_arch_id::core_arch_none };
+
+    static bool get_spine_core_info(std::vector<spine_core_info> & result);
+};
+
+struct spine_env_info {
+    std::vector<spine_core_info> core_info_list;
+    std::vector<int>             perfer_core_ids;
+    int                          aicpu_id_offset{ 0 };
+    int                          num_cores{ 0 };
+    int                          num_perfer_cores{ 0 };
+    spine_core_arch_id           perfer_core_arch_id{ spine_core_arch_id::core_arch_none };
+    bool                         exclude_main_thread{ false };
+    bool                         use_ime2{ false };
+    bool                         use_ime1{ false };
+    bool                         use_tcm{ false };
+    spine_mem_pool_backend       mem_backend{ spine_mem_pool_backend::transparent_hugepage };
+    uint64_t                     tcm_blk_size{ 0 };
+    uint64_t                     cpu_mask{ 0 };
+    spine_barrier_t *            init_barrier{ nullptr };
+    bool                         init_barrier_is_shared_mem{ false };
+
+    spine_env_info();
+    ~spine_env_info();
+};
+
+extern spine_env_info global_spine_env_info;
+
+}  // namespace ggml::cpu::riscv64_spacemit
@@ -1,26 +1,189 @@
 #pragma once

+#include <cassert>
 #include <cstddef>
+#include <functional>
+
+namespace spacemit_kernels {
+
+#define BLOCK_QNK_LEN 256
+
+template <int N> struct nrow_block_q2_k {
+    // [4bit scale + 4bit zp] * N * 16
+    uint8_t  scales[N * BLOCK_QNK_LEN / 16];
+    // [b0, b16, b32, b48] [b1, b17, b33, b49] ... [b15, b31, b47, b63]
+    // [b64, b80, b96, b112] ...[b79, b95, b111, b127]
+    // [b128, b144, b160, b176] ...[b143, b159, b175, b191]
+    // [b192, b208, b224, b240] ...[b207, b223, b239, b255]
+    uint8_t  qs[N * BLOCK_QNK_LEN / 4];
+    uint16_t scales16[N];
+    uint16_t zeros16[N];
+};
+
+template <int N> struct nrow_block_q3_k {
+    // [8bit scale] * N * 16
+    int8_t   scales[N * 16];
+    // [b0, b1, b2, b3, b4, b5, b6, b7] ... [b248, b249, b250, b251, b252, b253, b254, b255]
+    uint8_t  hmask[N * BLOCK_QNK_LEN / 8];
+    // [b0, b16, b32, b48] [b1, b17, b33, b49] ... [b15, b31, b47, b63]
+    // [b64, b80, b96, b112] ...[b79, b95, b111, b127]
+    // [b128, b144, b160, b176] ...[b143, b159, b175, b191]
+    // [b192, b208, b224, b240] ...[b207, b223, b239, b255]
+    uint8_t  qs[N * BLOCK_QNK_LEN / 4];
+    uint16_t scales16[N];
+};
+
+template <int N> struct nrow_block_mxfp4 {
+    uint8_t e[N];
+    uint8_t qh[4 * N];
+    uint8_t qs[16 * N];
+};
+
+template <int N> struct __attribute__((packed)) nrow_block_q5_1 {
+    uint16_t scales16[N];
+    uint8_t  zp[N];
+    // n0 [bh0, bh1, bh2, bh3, bh4, bh5, bh6, bh7] ....
+    uint8_t  qh[4 * N];
+    // n0 [b0, b1], [b2, b3] ....  [b30, b31]
+    // n1 [b0, b1], [b2, b3] ....  [b30, b31]
+    uint8_t  qs[16 * N];
+};
+
+static_assert(sizeof(nrow_block_q5_1<1>) == sizeof(uint8_t) + 22, "wrong nrow_block_q5_1 block size/padding");
+
+template <int N> struct __attribute__((packed)) nrow_block_q5_0 {
+    uint16_t scales16[N];
+    // n0 [bh0, bh1, bh2, bh3, bh4, bh5, bh6, bh7] ....
+    uint8_t  qh[4 * N];
+    // n0 [b0, b1], [b2, b3] ....  [b30, b31]
+    // n1 [b0, b1], [b2, b3] ....  [b30, b31]
+    uint8_t  qs[16 * N];
+};
+
+static_assert(sizeof(nrow_block_q5_0<1>) == 22, "wrong nrow_block_q5_0 block size/padding");
+
+using gemm_kernel_quantize_def = std::function<
+    size_t(size_t, const uint8_t *, const uint8_t *, const uint8_t *, float *, size_t, size_t, size_t, size_t)>;
+
+using moe_gemm_kernel_quantize_def = std::function<
+    size_t(size_t, const uint8_t **, const uint8_t *, const uint8_t *, float **, size_t, size_t, size_t, size_t)>;

-namespace sqnbitgemm_spacemit_ime {
 namespace ime1 {
-size_t gemm_kernel_i8i4(size_t            blk_len,
-                        const std::byte * quant_a_ptr,
-                        const std::byte * quant_b_data,
-                        const float *     quant_b_scale,
-                        const std::byte * quant_b_zp,
-                        float *           c_ptr,
-                        size_t            count_m,
-                        size_t            count_n,
-                        size_t            count_k,
-                        size_t            block_count_k,
-                        size_t            ldc,
-                        const float *     bias,
-                        const size_t      scale_stride);
+size_t gemm_kernel_i8i4(size_t          blk_len,
+                        const uint8_t * quant_a_ptr,
+                        const uint8_t * quant_b_data,
+                        const uint8_t * quant_b_zp,
+                        float *         c_ptr,
+                        size_t          count_m,
+                        size_t          count_n,
+                        size_t          k_blks,
+                        size_t          ldc);

-void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
+void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);

-void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
+void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);

 }  // namespace ime1
-}  // namespace sqnbitgemm_spacemit_ime
+
+namespace ime2 {
+size_t gemm_kernel_i8i2k(size_t          blk_len,
+                         const uint8_t * quant_a_ptr,
+                         const uint8_t * quant_b_data,
+                         const uint8_t * quant_b_zp,
+                         float *         c_ptr,
+                         size_t          count_m,
+                         size_t          count_n,
+                         size_t          k_blks,
+                         size_t          ldc);
+
+size_t gemm_kernel_i8i3k(size_t          blk_len,
+                         const uint8_t * quant_a_ptr,
+                         const uint8_t * quant_b_data,
+                         const uint8_t * quant_b_zp,
+                         float *         c_ptr,
+                         size_t          count_m,
+                         size_t          count_n,
+                         size_t          k_blks,
+                         size_t          ldc);
+
+size_t gemm_kernel_i8i4(size_t          blk_len,
+                        const uint8_t * quant_a_ptr,
+                        const uint8_t * quant_b_data,
+                        const uint8_t * quant_b_zp,
+                        float *         c_ptr,
+                        size_t          count_m,
+                        size_t          count_n,
+                        size_t          k_blks,
+                        size_t          ldc);
+
+size_t gemm_kernel_i8i4_hp(size_t          blk_len,
+                           const uint8_t * quant_a_ptr,
+                           const uint8_t * quant_b_data,
+                           const uint8_t * quant_b_zp,
+                           float *         c_ptr,
+                           size_t          count_m,
+                           size_t          count_n,
+                           size_t          k_blks,
+                           size_t          ldc);
+
+size_t moe_m2_gemm_kernel_i8i4(size_t           blk_len,
+                               const uint8_t ** quant_a_ptr,
+                               const uint8_t *  quant_b_data,
+                               const uint8_t *  quant_b_zp,
+                               float **         c_ptr,
+                               size_t           count_m,
+                               size_t           count_n,
+                               size_t           k_blks,
+                               size_t           ldc);
+
+size_t gemm_kernel_i8i8(size_t          blk_len,
+                        const uint8_t * quant_a_ptr,
+                        const uint8_t * quant_b_data,
+                        const uint8_t * quant_b_zp,
+                        float *         c_ptr,
+                        size_t          count_m,
+                        size_t          count_n,
+                        size_t          k_blks,
+                        size_t          ldc);
+
+size_t gemm_kernel_i8mxfp4(size_t          blk_len,
+                           const uint8_t * quant_a_ptr,
+                           const uint8_t * quant_b_data,
+                           const uint8_t * quant_b_zp,
+                           float *         c_ptr,
+                           size_t          count_m,
+                           size_t          count_n,
+                           size_t          k_blks,
+                           size_t          ldc);
+
+size_t moe_m2_gemm_kernel_i8mxfp4(size_t           blk_len,
+                                  const uint8_t ** quant_a_ptr,
+                                  const uint8_t *  quant_b_data,
+                                  const uint8_t *  quant_b_zp,
+                                  float **         c_ptr,
+                                  size_t           count_m,
+                                  size_t           count_n,
+                                  size_t           k_blks,
+                                  size_t           ldc);
+
+size_t gemm_kernel_i8i5(size_t          blk_len,
+                        const uint8_t * quant_a_ptr,
+                        const uint8_t * quant_b_data,
+                        const uint8_t * quant_b_zp,
+                        float *         c_ptr,
+                        size_t          count_m,
+                        size_t          count_n,
+                        size_t          k_blks,
+                        size_t          ldc);
+
+size_t moe_m2_gemm_kernel_i8i5(size_t           blk_len,
+                               const uint8_t ** quant_a_ptr,
+                               const uint8_t *  quant_b_data,
+                               const uint8_t *  quant_b_zp,
+                               float **         c_ptr,
+                               size_t           count_m,
+                               size_t           count_n,
+                               size_t           k_blks,
+                               size_t           ldc);
+}  // namespace ime2
+}  // namespace spacemit_kernels
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "ggml-common.h"
+#include "ggml.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace ggml::cpu::riscv64_spacemit {
+
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+int repack(ggml_tensor * t, const void * data, size_t data_size);
+
+}  // namespace ggml::cpu::riscv64_spacemit
@@ -0,0 +1,95 @@
+#pragma once
+
+#include "ggml-cpu-impl.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+
+namespace spacemit_kernels {
+
+constexpr auto div_round_up(auto up, auto down) {
+    return (up + down - 1) / down;
+}
+
+// Q8 Blk [f32] [s16] [int8 * blk_len]
+// Q8 Blk N [f32 * N] [s16 * N] [int8 * blk_len * N]
+constexpr size_t q8_blk_size(size_t blk_len, bool with_blk_sum = false) {
+    const size_t blk_size = sizeof(float) + blk_len * sizeof(int8_t) + (with_blk_sum ? sizeof(int16_t) : 0);
+    return blk_size;
+}
+
+// Q8 HP row block: K is split into K32 subblocks.
+// Each subblock stores [f32 scale] [int8 * 32], with an optional fp16 sum trailer per subblock.
+constexpr size_t q8_hp_blk_size(size_t blk_len, bool with_blk_sum = false, bool with_blk_scale = false) {
+    const size_t subblk_count = div_round_up(blk_len, size_t(32));
+    const size_t blk_size     = blk_len * sizeof(int8_t) + subblk_count * sizeof(_Float16) +
+                            (with_blk_sum ? subblk_count * sizeof(_Float16) : 0) +
+                            (with_blk_scale ? sizeof(_Float16) : 0);
+    return blk_size;
+}
+
+// Q8K Blk [f32] [s16 * (blk_len / 16)] [int8 * blk_len]
+// Q8K Blk N [f32 * N] [s16 * (blk_len / 16) * N] [int8 * blk_len * N]
+constexpr size_t q8k_blk_size(size_t blk_len) {
+    const size_t blk_size = sizeof(float) + blk_len * sizeof(int8_t) + sizeof(int16_t) * blk_len / 16;
+    return blk_size;
+}
+
+using quantize_a_row_def = std::function<void(size_t, const float *, size_t, uint8_t *)>;
+
+namespace rvv {
+void memcpy1d(void * dst, const void * src, int64_t size);
+
+void memcpy2d(void * dst, int64_t dst_stride, const void * src, int64_t src_stride, int64_t tile_rows, int64_t size);
+
+void forward_flash_attn_ext_f16_one_chunk_vlen1024_vf16(const ggml_compute_params * params,
+                                                        ggml_tensor *               dst,
+                                                        int                         ir0,
+                                                        int                         ir1,
+                                                        void *                      tcm_buffer,
+                                                        size_t                      tcm_buffer_size);
+
+void forward_flash_attn_ext_f16_tiled_vlen1024_vf16(const ggml_compute_params * params,
+                                                    ggml_tensor *               dst,
+                                                    int                         ir0,
+                                                    int                         ir1,
+                                                    void *                      tcm_buffer,
+                                                    size_t                      tcm_buffer_size);
+
+void forward_rms_norm_f32(ggml_compute_params * params, ggml_tensor * op);
+
+void forward_norm_f32(ggml_compute_params * params, ggml_tensor * op);
+
+void forward_cont_with_permute(ggml_compute_params * params, ggml_tensor * op);
+
+void forward_cpy_with_permute(ggml_compute_params * params, ggml_tensor * op);
+
+template <typename T> void forward_get_rows(ggml_compute_params * params, ggml_tensor * op);
+
+template <typename T> void forward_concat(ggml_compute_params * params, ggml_tensor * op);
+
+template <ggml_op op_type, typename T> void forward_binary(ggml_compute_params * params, ggml_tensor * op);
+
+template <typename T> void forward_sum_rows(const ggml_compute_params * params, ggml_tensor * op);
+
+template <typename T> void forward_repeat_nrows(ggml_compute_params * params, ggml_tensor * op);
+
+template <typename T> void forward_repeat_dim1(ggml_compute_params * params, ggml_tensor * op);
+
+void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
+
+void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
+
+void quantize_a_row_i8_hp(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
+
+void quantize_a_4row_i8_hp(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
+
+void quantize_a_row_i8k(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
+
+void quantize_a_4row_i8k(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr);
+
+}  // namespace rvv
+
+}  // namespace spacemit_kernels
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+
+#define SPINE_CACHE_LINE  64
+#define SPINE_CACHE_ALIGN __attribute__((aligned(SPINE_CACHE_LINE)))
+
+struct spine_barrier_t {
+    SPINE_CACHE_ALIGN std::atomic<int64_t> pending_;
+    SPINE_CACHE_ALIGN std::atomic<int64_t> rounds_;
+    SPINE_CACHE_ALIGN int64_t              total_;
+};
+
+inline void spine_barrier_wait(spine_barrier_t * b) {
+    auto cur_round = b->rounds_.load(std::memory_order_acquire);
+    auto cnt       = --b->pending_;
+    if (cnt == 0) {
+        b->pending_.store(b->total_);
+        b->rounds_.store(cur_round + 1);
+    } else {
+        while (cur_round == b->rounds_.load(std::memory_order_relaxed)) {
+            __asm__ volatile("pause " ::: "memory");
+        }
+    }
+}
+
+inline void spine_barrier_init(spine_barrier_t * b, int num_barriers, uint64_t thread_count) {
+    for (int i = 0; i < num_barriers; i++) {
+        b[i].total_ = thread_count;
+        b[i].pending_.store(thread_count);
+        b[i].rounds_.store(0);
+    }
+}
@@ -0,0 +1,760 @@
+#include "spine_mem_pool.h"
+
+#include "common.h"
+#include "ime_env.h"
+#include "spine_tcm.h"
+
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+namespace ggml::cpu::riscv64_spacemit {
+namespace {
+
+constexpr size_t   SPINE_MEM_POOL_CHUNK_SIZE         = 512ull * 1024ull * 1024ull;
+constexpr size_t   SPINE_SHARE_MEM_POOL_CHUNK_SIZE   = 512ull * 1024ull;
+constexpr size_t   SPINE_MEM_POOL_1G_REGION_SIZE     = 1ull << 30;
+constexpr uint64_t HUGETLB_1G_FLAG_REQUIRE_PUD       = 1ull << 0;
+constexpr char     SPINE_MEM_POOL_HUGETLB_1G_DEV[]   = "/dev/hugetlb_1g";
+constexpr char     SPINE_MEM_POOL_TCM_SYNC_MEM_DEV[] = "/dev/tcm_sync_mem";
+
+struct hugetlb_1g_region {
+    uint64_t size{ 0 };
+    uint64_t dma_addr{ 0 };
+    uint64_t flags{ 0 };
+    uint64_t reserved{ 0 };
+};
+
+#define HUGETLB_1G_IOC_MAGIC 'M'
+#define HUGETLB_1G_IOC_ALLOC _IOWR(HUGETLB_1G_IOC_MAGIC, 0x00, struct hugetlb_1g_region)
+#define HUGETLB_1G_IOC_FREE  _IO(HUGETLB_1G_IOC_MAGIC, 0x01)
+
+struct free_block {
+    size_t offset{ 0 };
+    size_t size{ 0 };
+};
+
+struct pool_chunk {
+    uint8_t *               base{ nullptr };
+    size_t                  size{ 0 };
+    int                     fd{ -1 };
+    std::vector<free_block> free_blocks;
+};
+
+struct pool_allocation {
+    void * chunk_base{ nullptr };
+    size_t chunk_size{ 0 };
+    void * base{ nullptr };
+    size_t size{ 0 };
+};
+
+bool is_power_of_two(size_t value) {
+    return value != 0 && (value & (value - 1)) == 0;
+}
+
+bool align_up(size_t value, size_t alignment, size_t * aligned_value) {
+    if (aligned_value == nullptr || alignment == 0) {
+        return false;
+    }
+
+    const size_t remainder = value % alignment;
+    if (remainder == 0) {
+        *aligned_value = value;
+        return true;
+    }
+
+    const size_t padding = alignment - remainder;
+    if (value > std::numeric_limits<size_t>::max() - padding) {
+        return false;
+    }
+
+    *aligned_value = value + padding;
+    return true;
+}
+
+bool align_up_uintptr(uintptr_t value, size_t alignment, uintptr_t * aligned_value) {
+    if (aligned_value == nullptr || alignment == 0) {
+        return false;
+    }
+
+    const uintptr_t remainder = value % alignment;
+    if (remainder == 0) {
+        *aligned_value = value;
+        return true;
+    }
+
+    const uintptr_t padding = alignment - remainder;
+    if (value > std::numeric_limits<uintptr_t>::max() - padding) {
+        return false;
+    }
+
+    *aligned_value = value + padding;
+    return true;
+}
+
+class spine_mem_pool_manager {
+  public:
+    explicit spine_mem_pool_manager(size_t default_chunk_size) : default_chunk_size_(default_chunk_size) {}
+
+    virtual ~spine_mem_pool_manager() = default;
+
+    void * alloc(size_t size, size_t alignment) {
+        if (size == 0 || !is_power_of_two(alignment)) {
+            return nullptr;
+        }
+
+        size_t aligned_size = 0;
+        if (!align_up(size, alignment, &aligned_size)) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: align_up failed for size %zu alignment %zu\n", __func__, size,
+                           alignment);
+            return nullptr;
+        }
+
+        pool_allocation allocation;
+
+        std::lock_guard<std::mutex> lock(mutex_);
+
+        if (!try_alloc_locked(aligned_size, alignment, &allocation)) {
+            if (!add_chunk_locked(aligned_size, alignment)) {
+                return nullptr;
+            }
+
+            if (!try_alloc_locked(aligned_size, alignment, &allocation)) {
+                GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation retry failed for size %zu alignment %zu\n",
+                               __func__, aligned_size, alignment);
+                return nullptr;
+            }
+        }
+
+        try {
+            const auto [allocation_it, inserted] = allocations_.emplace(allocation.base, allocation);
+            if (!inserted) {
+                GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: duplicate allocation key %p\n", __func__, allocation.base);
+                rollback_allocation_locked(allocation);
+                return nullptr;
+            }
+        } catch (const std::bad_alloc &) {
+            rollback_allocation_locked(allocation);
+            throw;
+        }
+
+        return allocation.base;
+    }
+
+    void free(void * base) {
+        if (base == nullptr) {
+            return;
+        }
+
+        std::lock_guard<std::mutex> lock(mutex_);
+
+        auto allocation_it = allocations_.find(base);
+        if (allocation_it == allocations_.end()) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: unknown allocation %p\n", __func__, base);
+            return;
+        }
+
+        pool_allocation allocation = allocation_it->second;
+        allocations_.erase(allocation_it);
+
+        auto chunk_it = find_chunk_locked(allocation);
+        if (chunk_it == chunks_.end()) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: unknown chunk for allocation %p size %zu\n", __func__,
+                           allocation.base, allocation.size);
+            return;
+        }
+
+        auto * chunk_base = chunk_it->base;
+        auto * alloc_base = static_cast<uint8_t *>(allocation.base);
+        if (alloc_base < chunk_base || alloc_base >= chunk_base + chunk_it->size) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation %p out of chunk range %p..%p\n", __func__,
+                           allocation.base, chunk_base, chunk_base + chunk_it->size);
+            return;
+        }
+
+        const size_t offset = static_cast<size_t>(alloc_base - chunk_base);
+        if (offset > chunk_it->size || allocation.size > chunk_it->size - offset) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: allocation %p size %zu exceeds chunk size %zu\n", __func__,
+                           allocation.base, allocation.size, chunk_it->size);
+            return;
+        }
+
+        insert_free_block_locked(*chunk_it, { offset, allocation.size });
+        maybe_release_empty_chunk_locked(chunk_it);
+    }
+
+  protected:
+    void release_chunks() {
+        std::lock_guard<std::mutex> lock(mutex_);
+
+        allocations_.clear();
+        for (auto & chunk : chunks_) {
+            dealloc_chunk(&chunk);
+        }
+        chunks_.clear();
+    }
+
+    size_t default_chunk_size() const { return default_chunk_size_; }
+
+    static void clear_chunk(pool_chunk * chunk) {
+        chunk->base = nullptr;
+        chunk->size = 0;
+        chunk->fd   = -1;
+        chunk->free_blocks.clear();
+    }
+
+    virtual bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) = 0;
+    virtual void dealloc_chunk(pool_chunk * chunk)                                                    = 0;
+
+  private:
+    struct alloc_candidate {
+        size_t    chunk_index{ 0 };
+        size_t    block_index{ 0 };
+        size_t    aligned_offset{ 0 };
+        uintptr_t address{ std::numeric_limits<uintptr_t>::max() };
+        bool      valid{ false };
+    };
+
+    std::vector<pool_chunk>::iterator find_chunk_locked(const pool_allocation & allocation) {
+        return std::find_if(chunks_.begin(), chunks_.end(), [&](const pool_chunk & chunk) {
+            return chunk.base == allocation.chunk_base && chunk.size == allocation.chunk_size;
+        });
+    }
+
+    bool add_chunk_locked(size_t min_size, size_t alignment) {
+        pool_chunk   chunk;
+        const size_t chunk_request = default_chunk_size_ == 0 ? min_size : std::max(min_size, default_chunk_size_);
+        void *       hint_addr     = nullptr;
+
+        for (const auto & existing_chunk : chunks_) {
+            auto * chunk_end = existing_chunk.base + existing_chunk.size;
+            if (hint_addr == nullptr || chunk_end > hint_addr) {
+                hint_addr = chunk_end;
+            }
+        }
+
+        if (!alloc_chunk(chunk_request, alignment, hint_addr, &chunk)) {
+            return false;
+        }
+
+        if (chunk.base == nullptr || chunk.size < min_size) {
+            GGML_LOG_ERROR(
+                "CPU_RISCV64_SPACEMIT: %s: invalid chunk returned for request size %zu, chunk_base=%p chunk_size=%zu\n",
+                __func__, min_size, chunk.base, chunk.size);
+            dealloc_chunk(&chunk);
+            return false;
+        }
+
+        try {
+            chunk.free_blocks.push_back({ 0, chunk.size });
+            chunks_.push_back(std::move(chunk));
+        } catch (const std::bad_alloc &) {
+            dealloc_chunk(&chunk);
+            throw;
+        }
+
+        return true;
+    }
+
+    void rollback_allocation_locked(const pool_allocation & allocation) {
+        auto chunk_it = find_chunk_locked(allocation);
+        if (chunk_it == chunks_.end()) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p, owning chunk not found\n",
+                           __func__, allocation.base);
+            return;
+        }
+
+        auto * chunk_base = chunk_it->base;
+        auto * alloc_base = static_cast<uint8_t *>(allocation.base);
+        if (alloc_base < chunk_base || alloc_base >= chunk_base + chunk_it->size) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p, chunk range is invalid\n",
+                           __func__, allocation.base);
+            return;
+        }
+
+        const size_t offset = static_cast<size_t>(alloc_base - chunk_base);
+        if (offset > chunk_it->size || allocation.size > chunk_it->size - offset) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to rollback allocation %p size %zu\n", __func__,
+                           allocation.base, allocation.size);
+            return;
+        }
+
+        insert_free_block_locked(*chunk_it, { offset, allocation.size });
+        maybe_release_empty_chunk_locked(chunk_it);
+    }
+
+    bool try_alloc_locked(size_t size, size_t alignment, pool_allocation * allocation) {
+        alloc_candidate best;
+
+        for (size_t chunk_index = 0; chunk_index < chunks_.size(); ++chunk_index) {
+            const auto & chunk = chunks_[chunk_index];
+            for (size_t block_index = 0; block_index < chunk.free_blocks.size(); ++block_index) {
+                const auto & block = chunk.free_blocks[block_index];
+
+                uintptr_t  aligned_addr = 0;
+                const auto block_addr   = reinterpret_cast<uintptr_t>(chunk.base + block.offset);
+                if (!align_up_uintptr(block_addr, alignment, &aligned_addr)) {
+                    continue;
+                }
+
+                if (aligned_addr < block_addr) {
+                    continue;
+                }
+
+                const size_t aligned_offset = block.offset + static_cast<size_t>(aligned_addr - block_addr);
+                const size_t padding        = aligned_offset - block.offset;
+                if (padding > block.size || size > block.size - padding) {
+                    continue;
+                }
+
+                if (!best.valid || aligned_addr < best.address) {
+                    best.chunk_index    = chunk_index;
+                    best.block_index    = block_index;
+                    best.aligned_offset = aligned_offset;
+                    best.address        = aligned_addr;
+                    best.valid          = true;
+                }
+            }
+        }
+
+        if (!best.valid) {
+            return false;
+        }
+
+        auto &           chunk     = chunks_[best.chunk_index];
+        const free_block block     = chunk.free_blocks[best.block_index];
+        const size_t     padding   = best.aligned_offset - block.offset;
+        const size_t     alloc_end = best.aligned_offset + size;
+        const size_t     block_end = block.offset + block.size;
+
+        chunk.free_blocks.erase(chunk.free_blocks.begin() + best.block_index);
+        auto insert_it = chunk.free_blocks.begin() + best.block_index;
+        if (padding != 0) {
+            insert_it = chunk.free_blocks.insert(insert_it, { block.offset, padding });
+            ++insert_it;
+        }
+        if (alloc_end < block_end) {
+            chunk.free_blocks.insert(insert_it, { alloc_end, block_end - alloc_end });
+        }
+
+        allocation->chunk_base = chunk.base;
+        allocation->chunk_size = chunk.size;
+        allocation->base       = chunk.base + best.aligned_offset;
+        allocation->size       = size;
+        return true;
+    }
+
+    void maybe_release_empty_chunk_locked(std::vector<pool_chunk>::iterator chunk_it) {
+        if (chunk_it->free_blocks.size() != 1) {
+            return;
+        }
+
+        const auto & block = chunk_it->free_blocks.front();
+        if (block.offset != 0 || block.size != chunk_it->size) {
+            return;
+        }
+
+        dealloc_chunk(&*chunk_it);
+        chunks_.erase(chunk_it);
+    }
+
+    void insert_free_block_locked(pool_chunk & chunk, free_block block) {
+        auto it = chunk.free_blocks.begin();
+        while (it != chunk.free_blocks.end() && it->offset < block.offset) {
+            ++it;
+        }
+
+        if (it != chunk.free_blocks.begin()) {
+            const auto & prev = *(it - 1);
+            if (prev.offset + prev.size > block.offset) {
+                GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: overlapping free block at offset %zu size %zu\n", __func__,
+                               block.offset, block.size);
+                return;
+            }
+        }
+
+        if (it != chunk.free_blocks.end() && block.offset + block.size > it->offset) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: overlapping next free block at offset %zu size %zu\n", __func__,
+                           block.offset, block.size);
+            return;
+        }
+
+        it = chunk.free_blocks.insert(it, block);
+
+        if (it != chunk.free_blocks.begin()) {
+            auto prev = it - 1;
+            if (prev->offset + prev->size == it->offset) {
+                it->offset = prev->offset;
+                it->size += prev->size;
+                it = chunk.free_blocks.erase(prev);
+            }
+        }
+
+        if (it + 1 != chunk.free_blocks.end() && it->offset + it->size == (it + 1)->offset) {
+            it->size += (it + 1)->size;
+            chunk.free_blocks.erase(it + 1);
+        }
+    }
+
+    std::mutex                                  mutex_;
+    std::vector<pool_chunk>                     chunks_;
+    std::unordered_map<void *, pool_allocation> allocations_;
+    size_t                                      default_chunk_size_{ 0 };
+};
+
+class spine_mem_pool_posix final : public spine_mem_pool_manager {
+  public:
+    spine_mem_pool_posix() : spine_mem_pool_manager(0) {}
+
+    ~spine_mem_pool_posix() override { release_chunks(); }
+
+  private:
+    bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
+        (void) hint_addr;
+
+        const size_t alloc_alignment = std::max(alignment, sizeof(void *));
+        void *       base            = nullptr;
+        const int    rc              = posix_memalign(&base, alloc_alignment, min_size);
+        if (rc != 0) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: posix_memalign failed for size %zu alignment %zu, rc=%d\n",
+                           __func__, min_size, alloc_alignment, rc);
+            return false;
+        }
+
+        chunk->base = static_cast<uint8_t *>(base);
+        chunk->size = min_size;
+        chunk->fd   = -1;
+        return true;
+    }
+
+    void dealloc_chunk(pool_chunk * chunk) override {
+        std::free(chunk->base);
+        clear_chunk(chunk);
+    }
+};
+
+class spine_mem_pool_transparent_hugepage final : public spine_mem_pool_manager {
+  public:
+    spine_mem_pool_transparent_hugepage() : spine_mem_pool_manager(SPINE_MEM_POOL_CHUNK_SIZE) {}
+
+    ~spine_mem_pool_transparent_hugepage() override { release_chunks(); }
+
+  private:
+    bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
+        (void) alignment;
+
+        size_t chunk_size = 0;
+        if (!align_up(min_size, default_chunk_size(), &chunk_size)) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to round chunk size for %zu\n", __func__, min_size);
+            return false;
+        }
+
+        void * map_addr = mmap(hint_addr, chunk_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+        if (map_addr == MAP_FAILED) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for chunk size %zu, errno=%d\n", __func__, chunk_size,
+                           errno);
+            return false;
+        }
+
+        if (madvise(map_addr, chunk_size, MADV_HUGEPAGE) != 0) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: madvise(MADV_HUGEPAGE) failed for chunk size %zu, errno=%d\n",
+                           __func__, chunk_size, errno);
+            munmap(map_addr, chunk_size);
+            return false;
+        }
+
+        chunk->base = static_cast<uint8_t *>(map_addr);
+        chunk->size = chunk_size;
+        chunk->fd   = -1;
+        return true;
+    }
+
+    void dealloc_chunk(pool_chunk * chunk) override {
+        if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for chunk %p size %zu, errno=%d\n", __func__,
+                           chunk->base, chunk->size, errno);
+        }
+
+        clear_chunk(chunk);
+    }
+};
+
+class spine_mem_pool_hugetlb_1g final : public spine_mem_pool_manager {
+  public:
+    spine_mem_pool_hugetlb_1g() : spine_mem_pool_manager(SPINE_MEM_POOL_1G_REGION_SIZE) {}
+
+    ~spine_mem_pool_hugetlb_1g() override { release_chunks(); }
+
+  private:
+    bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
+        (void) alignment;
+        (void) hint_addr;
+
+        size_t region_size = 0;
+        if (!align_up(min_size, SPINE_MEM_POOL_1G_REGION_SIZE, &region_size)) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to round hugetlb_1g size for %zu\n", __func__, min_size);
+            return false;
+        }
+
+        const int fd = open(SPINE_MEM_POOL_HUGETLB_1G_DEV, O_RDWR);
+        if (fd < 0) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: open(%s) failed, errno=%d\n", __func__,
+                           SPINE_MEM_POOL_HUGETLB_1G_DEV, errno);
+            return false;
+        }
+
+        hugetlb_1g_region region;
+        region.size  = region_size;
+        region.flags = HUGETLB_1G_FLAG_REQUIRE_PUD;
+        if (ioctl(fd, HUGETLB_1G_IOC_ALLOC, &region) < 0) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: HUGETLB_1G_IOC_ALLOC failed for size %zu, errno=%d\n", __func__,
+                           region_size, errno);
+            close(fd);
+            return false;
+        }
+
+        void * map_addr = mmap(nullptr, region.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+        if (map_addr == MAP_FAILED) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for hugetlb_1g size %llu, errno=%d\n", __func__,
+                           static_cast<unsigned long long>(region.size), errno);
+            ioctl(fd, HUGETLB_1G_IOC_FREE);
+            close(fd);
+            return false;
+        }
+
+        chunk->base = static_cast<uint8_t *>(map_addr);
+        chunk->size = region.size;
+        chunk->fd   = fd;
+        return true;
+    }
+
+    void dealloc_chunk(pool_chunk * chunk) override {
+        if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for hugetlb_1g chunk %p size %zu, errno=%d\n",
+                           __func__, chunk->base, chunk->size, errno);
+        }
+
+        if (chunk->fd >= 0) {
+            if (ioctl(chunk->fd, HUGETLB_1G_IOC_FREE) < 0) {
+                GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: HUGETLB_1G_IOC_FREE failed for chunk %p, errno=%d\n",
+                               __func__, chunk->base, errno);
+            }
+
+            close(chunk->fd);
+        }
+
+        clear_chunk(chunk);
+    }
+};
+
+class spine_mem_pool_shared_mem final : public spine_mem_pool_manager {
+  public:
+    spine_mem_pool_shared_mem() : spine_mem_pool_manager(SPINE_SHARE_MEM_POOL_CHUNK_SIZE) {}
+
+    ~spine_mem_pool_shared_mem() override { release_chunks(); }
+
+  private:
+    bool alloc_chunk(size_t min_size, size_t alignment, void * hint_addr, pool_chunk * chunk) override {
+        (void) alignment;
+
+        if (hint_addr != nullptr) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: shared_mem does not support multiple active chunks\n", __func__);
+            return false;
+        }
+
+        if (min_size > default_chunk_size()) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: shared_mem request %zu exceeds chunk size %zu\n", __func__,
+                           min_size, default_chunk_size());
+            return false;
+        }
+
+        const int fd = open(SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, O_RDWR | O_SYNC);
+        if (fd < 0) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: open(%s) failed, errno=%d\n", __func__,
+                           SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, errno);
+            return false;
+        }
+
+        void * map_addr = mmap(nullptr, default_chunk_size(), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+        if (map_addr == MAP_FAILED) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: mmap failed for %s size %zu, errno=%d\n", __func__,
+                           SPINE_MEM_POOL_TCM_SYNC_MEM_DEV, default_chunk_size(), errno);
+            close(fd);
+            return false;
+        }
+
+        chunk->base = static_cast<uint8_t *>(map_addr);
+        chunk->size = default_chunk_size();
+        chunk->fd   = fd;
+        return true;
+    }
+
+    void dealloc_chunk(pool_chunk * chunk) override {
+        if (chunk->base != nullptr && chunk->size != 0 && munmap(chunk->base, chunk->size) != 0) {
+            GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: munmap failed for shared_mem chunk %p size %zu, errno=%d\n",
+                           __func__, chunk->base, chunk->size, errno);
+        }
+
+        if (chunk->fd >= 0) {
+            close(chunk->fd);
+        }
+
+        clear_chunk(chunk);
+    }
+};
+
+spine_mem_pool_manager & get_spine_mem_pool_manager() {
+    static std::once_flag                          pool_once;
+    static std::unique_ptr<spine_mem_pool_manager> selected_pool;
+    static spine_mem_pool_backend                  selected_backend = spine_mem_pool_backend::none;
+
+    spine_mem_pool_backend backend = global_spine_env_info.mem_backend;
+    if (backend == spine_mem_pool_backend::none) {
+        backend = spine_mem_pool_backend::transparent_hugepage;
+    }
+
+    std::call_once(pool_once, [&]() {
+        selected_backend = backend;
+
+        switch (selected_backend) {
+            case spine_mem_pool_backend::posix_memalign:
+                selected_pool = std::make_unique<spine_mem_pool_posix>();
+                break;
+            case spine_mem_pool_backend::transparent_hugepage:
+                selected_pool = std::make_unique<spine_mem_pool_transparent_hugepage>();
+                break;
+            case spine_mem_pool_backend::hugetlb_1g:
+                selected_pool = std::make_unique<spine_mem_pool_hugetlb_1g>();
+                break;
+            case spine_mem_pool_backend::none:
+                selected_backend = spine_mem_pool_backend::transparent_hugepage;
+                selected_pool    = std::make_unique<spine_mem_pool_transparent_hugepage>();
+                break;
+        }
+    });
+
+    if (backend != selected_backend) {
+        GGML_LOG_ERROR(
+            "CPU_RISCV64_SPACEMIT: %s: mem pool backend is process-global and mutually exclusive, requested=%d but "
+            "selected=%d\n",
+            __func__, static_cast<int>(backend), static_cast<int>(selected_backend));
+    }
+
+    if (selected_pool) {
+        return *selected_pool;
+    }
+
+    throw std::bad_alloc();
+}
+
+spine_mem_pool_manager & get_spine_mem_pool_shared_mem_manager() {
+    static std::once_flag                             shared_mem_pool_once;
+    static std::unique_ptr<spine_mem_pool_shared_mem> shared_mem_pool;
+
+    std::call_once(shared_mem_pool_once, [&]() { shared_mem_pool = std::make_unique<spine_mem_pool_shared_mem>(); });
+
+    if (shared_mem_pool) {
+        return *shared_mem_pool;
+    }
+
+    throw std::bad_alloc();
+}
+
+}  // namespace
+
+bool spine_mem_pool_tcm_init(spine_mem_pool_tcm_info * info) noexcept {
+    if (info == nullptr) {
+        return false;
+    }
+
+    *info = {};
+
+    if (spine_tcm_open_handle(NULL) != 0 || !spine_tcm_is_available()) {
+        return false;
+    }
+
+    spine_tcm_mem_info_t mem_info;
+    if (spine_tcm_mem_info(&mem_info) != 0) {
+        return false;
+    }
+
+    info->available   = true;
+    info->blk_size    = mem_info.blk_size;
+    info->blk_num     = mem_info.blk_num;
+    info->is_fake_tcm = mem_info.is_fake_tcm != 0;
+    return true;
+}
+
+void * spine_mem_pool_tcm_mem_get(int cpu_id) noexcept {
+    return spine_tcm_mem_get(cpu_id);
+}
+
+void * spine_mem_pool_tcm_mem_wait(int cpu_id) noexcept {
+    return spine_tcm_mem_try_wait(cpu_id, 1000 * 1000);
+}
+
+int spine_mem_pool_tcm_mem_release(int cpu_id) noexcept {
+    return spine_tcm_mem_release(cpu_id);
+}
+
+void * spine_mem_pool_alloc(size_t size, size_t alignment) noexcept {
+    try {
+        return get_spine_mem_pool_manager().alloc(size, alignment);
+    } catch (const std::bad_alloc &) {
+        GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while allocating size %zu\n", __func__, size);
+        return nullptr;
+    }
+}
+
+void * spine_mem_pool_shared_mem_alloc(size_t size, size_t alignment) noexcept {
+    try {
+        return get_spine_mem_pool_shared_mem_manager().alloc(size, alignment);
+    } catch (const std::bad_alloc &) {
+        GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while allocating shared memory size %zu\n", __func__, size);
+        return nullptr;
+    }
+}
+
+void spine_mem_pool_free(void * base) noexcept {
+    try {
+        get_spine_mem_pool_manager().free(base);
+    } catch (const std::bad_alloc &) {
+        GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while freeing allocation %p\n", __func__, base);
+    }
+}
+
+void spine_mem_pool_shared_mem_free(void * base) noexcept {
+    try {
+        get_spine_mem_pool_shared_mem_manager().free(base);
+    } catch (const std::bad_alloc &) {
+        GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: bad_alloc while freeing shared allocation %p\n", __func__, base);
+    }
+}
+
+}  // namespace ggml::cpu::riscv64_spacemit
+
+extern "C" {
+void * ggml_backend_cpu_riscv64_spacemit_alloc_shared(size_t size, size_t alignment) {
+    void * result = ggml::cpu::riscv64_spacemit::spine_mem_pool_shared_mem_alloc(size, alignment);
+    if (result == nullptr) {
+        GGML_LOG_ERROR("CPU_RISCV64_SPACEMIT: %s: failed to allocate shared memory size %zu alignment %zu\n", __func__,
+                       size, alignment);
+    }
+    return result;
+}
+
+void ggml_backend_cpu_riscv64_spacemit_free_shared(void * ptr) {
+    ggml::cpu::riscv64_spacemit::spine_mem_pool_shared_mem_free(ptr);
+}
+}
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace ggml::cpu::riscv64_spacemit {
+
+enum class spine_mem_pool_backend : uint8_t {
+    none,
+    posix_memalign,
+    transparent_hugepage,
+    hugetlb_1g,
+};
+
+struct spine_mem_pool_tcm_info {
+    bool   available{ false };
+    size_t blk_size{ 0 };
+    size_t blk_num{ 0 };
+    bool   is_fake_tcm{ false };
+};
+
+bool   spine_mem_pool_tcm_init(spine_mem_pool_tcm_info * info) noexcept;
+void * spine_mem_pool_tcm_mem_get(int cpu_id) noexcept;
+void * spine_mem_pool_tcm_mem_wait(int cpu_id) noexcept;
+int    spine_mem_pool_tcm_mem_release(int cpu_id) noexcept;
+
+void * spine_mem_pool_alloc(size_t size, size_t alignment) noexcept;
+void * spine_mem_pool_shared_mem_alloc(size_t size, size_t alignment) noexcept;
+void   spine_mem_pool_free(void * base) noexcept;
+void   spine_mem_pool_shared_mem_free(void * base) noexcept;
+
+}  // namespace ggml::cpu::riscv64_spacemit
@@ -0,0 +1,409 @@
+#ifndef SPINE_TCM_PUBLIC_H_
+#define SPINE_TCM_PUBLIC_H_
+
+/*
+ * spine_tcm public API
+ *
+ * Usage:
+ *   1. Direct link mode
+ *      Define SPINE_TCM_DIRECT_LINK and link against libspine_tcm.so.
+ *
+ *      if (spine_tcm_is_available()) {
+ *          void *buffer = spine_tcm_mem_get(0);
+ *          spine_tcm_mem_free(0);
+ *      }
+ *
+ *   2. Header-only loader mode
+ *      Include this header without linking libspine_tcm.so. The loader first
+ *      tries to reuse a process-global spine_tcm instance and falls back to
+ *      dlopen("libspine_tcm.so") when needed.
+ *
+ *      spine_tcm_open_handle(NULL);  // optional pre-bind
+ *      if (spine_tcm_is_available()) {
+ *          void *buffer = spine_tcm_mem_get(0);
+ *          spine_tcm_mem_free(0);
+ *      }
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if !defined(SPINE_TCM_BUILD_SHARED) && !defined(SPINE_TCM_DIRECT_LINK)
+#    include <dlfcn.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32)
+#    if defined(SPINE_TCM_BUILD_SHARED)
+#        define SPINE_TCM_API __declspec(dllexport)
+#    else
+#        define SPINE_TCM_API __declspec(dllimport)
+#    endif
+#else
+#    define SPINE_TCM_API __attribute__((visibility("default")))
+#endif
+
+typedef struct spine_tcm_mem_info {
+    size_t blk_size;
+    size_t blk_num;
+    int    is_fake_tcm;
+} spine_tcm_mem_info_t;
+
+typedef struct spine_tcm_block_info {
+    int      id;
+    void *   va;
+    size_t   size;
+    uint64_t phys_addr;
+    uint64_t cpu_affinity_mask;
+    int      owner_tid;
+    int      is_acquired;
+} spine_tcm_block_info_t;
+
+/* Shared-library runtime ABI exported by libspine_tcm.so. */
+SPINE_TCM_API const char * spine_tcm_runtime_version(void);
+SPINE_TCM_API int          spine_tcm_runtime_is_available(void);
+SPINE_TCM_API int          spine_tcm_runtime_layout_info(spine_tcm_mem_info_t * info);
+SPINE_TCM_API int          spine_tcm_runtime_mem_info(int id, spine_tcm_block_info_t * info);
+SPINE_TCM_API void *       spine_tcm_runtime_mem_get(int id);
+SPINE_TCM_API int          spine_tcm_runtime_mem_free(int id);
+SPINE_TCM_API void *       spine_tcm_runtime_mem_try_wait(int id, size_t timeout_us);
+SPINE_TCM_API int          spine_tcm_runtime_mem_release(int id);
+SPINE_TCM_API int          spine_tcm_runtime_mem_force_release(int id);
+SPINE_TCM_API int          spine_tcm_runtime_mem_query(int id);
+
+#if defined(SPINE_TCM_DIRECT_LINK)
+/* Optional no-op in direct-link mode. */
+static inline int spine_tcm_open_handle(const char * so_path) {
+    (void) so_path;
+    return 0;
+}
+
+static inline const char * spine_tcm_version(void) {
+    return spine_tcm_runtime_version();
+}
+
+/* Returns 1 when the runtime driver is available, otherwise 0. */
+static inline int spine_tcm_is_available(void) {
+    return spine_tcm_runtime_is_available();
+}
+
+/* Returns runtime memory geometry and whether the current backend is fake TCM. */
+static inline int spine_tcm_mem_info(spine_tcm_mem_info_t * info) {
+    return spine_tcm_runtime_layout_info(info);
+}
+
+/* Returns per-block runtime metadata for the given TCM id. */
+static inline int spine_tcm_block_info(int id, spine_tcm_block_info_t * info) {
+    return spine_tcm_runtime_mem_info(id, info);
+}
+
+/* Returns a cached buffer for the given TCM id, or NULL on failure. */
+static inline void * spine_tcm_mem_get(int id) {
+    return spine_tcm_runtime_mem_get(id);
+}
+
+/* Releases one reference acquired by spine_tcm_mem_get(id). */
+static inline int spine_tcm_mem_free(int id) {
+    return spine_tcm_runtime_mem_free(id);
+}
+
+/* Waits for a TCM block handoff and returns the driver-owned buffer when available. */
+static inline void * spine_tcm_mem_try_wait(int id, size_t over_time) {
+    return spine_tcm_runtime_mem_try_wait(id, over_time);
+}
+
+/* Releases a buffer acquired by spine_tcm_mem_try_wait(id, over_time). */
+static inline int spine_tcm_mem_release(int id) {
+    return spine_tcm_runtime_mem_release(id);
+}
+
+/* Forces a release for the given TCM id when the backend supports it. */
+static inline int spine_tcm_mem_force_release(int id) {
+    return spine_tcm_runtime_mem_force_release(id);
+}
+
+/* Returns whether the given TCM id is currently acquired. */
+static inline int spine_tcm_mem_query(int id) {
+    return spine_tcm_runtime_mem_query(id);
+}
+#elif !defined(SPINE_TCM_BUILD_SHARED)
+typedef struct spine_tcm_handle {
+    void * module_handle;
+    int    use_global_scope;
+    int    owns_module_handle;
+    const char * (*runtime_version)(void);
+    int (*runtime_is_available)(void);
+    int (*runtime_layout_info)(spine_tcm_mem_info_t * info);
+    int (*runtime_mem_info)(int id, spine_tcm_block_info_t * info);
+    void * (*runtime_mem_get)(int id);
+    int (*runtime_mem_free)(int id);
+    void * (*runtime_mem_try_wait)(int id, size_t over_time);
+    int (*runtime_mem_release)(int id);
+    int (*runtime_mem_force_release)(int id);
+    int (*runtime_mem_query)(int id);
+} spine_tcm_handle_t;
+
+static inline spine_tcm_handle_t * spine_tcm_default_handle(void) {
+    static spine_tcm_handle_t handle = { 0 };
+    return &handle;
+}
+
+static inline void spine_tcm_handle_reset(spine_tcm_handle_t * handle) {
+    if (handle != NULL) {
+        memset(handle, 0, sizeof(*handle));
+    }
+}
+
+static inline int spine_tcm_handle_bind(spine_tcm_handle_t * handle) {
+    void * symbol_scope = handle->use_global_scope ? RTLD_DEFAULT : handle->module_handle;
+
+    handle->runtime_version      = (const char * (*) (void) ) dlsym(symbol_scope, "spine_tcm_runtime_version");
+    handle->runtime_is_available = (int (*)(void)) dlsym(symbol_scope, "spine_tcm_runtime_is_available");
+    handle->runtime_layout_info =
+        (int (*)(spine_tcm_mem_info_t *)) dlsym(symbol_scope, "spine_tcm_runtime_layout_info");
+    handle->runtime_mem_info =
+        (int (*)(int, spine_tcm_block_info_t *)) dlsym(symbol_scope, "spine_tcm_runtime_mem_info");
+    handle->runtime_mem_get      = (void * (*) (int) ) dlsym(symbol_scope, "spine_tcm_runtime_mem_get");
+    handle->runtime_mem_free     = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_free");
+    handle->runtime_mem_try_wait = (void * (*) (int, size_t)) dlsym(symbol_scope, "spine_tcm_runtime_mem_try_wait");
+    handle->runtime_mem_release  = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_release");
+    handle->runtime_mem_force_release = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_force_release");
+    handle->runtime_mem_query         = (int (*)(int)) dlsym(symbol_scope, "spine_tcm_runtime_mem_query");
+
+    return handle->runtime_version != NULL && handle->runtime_is_available != NULL &&
+                   handle->runtime_layout_info != NULL && handle->runtime_mem_info != NULL &&
+                   handle->runtime_mem_get != NULL && handle->runtime_mem_free != NULL &&
+                   handle->runtime_mem_try_wait != NULL && handle->runtime_mem_release != NULL &&
+                   handle->runtime_mem_force_release != NULL && handle->runtime_mem_query != NULL ?
+               0 :
+               -1;
+}
+
+/*
+ * Try to bind against an already-loaded process-global spine_tcm instance.
+ * The shared library exports spine_tcm_runtime_marker only for this probe.
+ */
+static inline int spine_tcm_try_bind_global(spine_tcm_handle_t * handle) {
+    if (dlsym(RTLD_DEFAULT, "spine_tcm_runtime_marker") == NULL) {
+        return -1;
+    }
+
+    handle->use_global_scope = 1;
+    return spine_tcm_handle_bind(handle);
+}
+
+/*
+ * Optional pre-bind entry point.
+ *
+ * Behavior:
+ *   - Reuses an already-loaded global spine_tcm instance when available.
+ *   - Otherwise loads the shared library from so_path or the default soname.
+ *   - Repeated calls are safe and return 0 after the first successful bind.
+ */
+static inline int spine_tcm_open_handle(const char * so_path) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+    const char *         library  = (so_path != NULL && so_path[0] != '\0') ? so_path : "libspine_tcm.so";
+
+    if (resolved->module_handle != NULL || resolved->use_global_scope) {
+        return 0;
+    }
+
+    if (spine_tcm_try_bind_global(resolved) == 0) {
+        return 0;
+    }
+
+    spine_tcm_handle_reset(resolved);
+
+    resolved->module_handle      = dlopen(library, RTLD_LAZY | RTLD_GLOBAL);
+    resolved->owns_module_handle = resolved->module_handle != NULL ? 1 : 0;
+
+    if (resolved->module_handle == NULL) {
+        spine_tcm_handle_reset(resolved);
+        return -1;
+    }
+
+    if (spine_tcm_handle_bind(resolved) != 0) {
+        if (resolved->owns_module_handle) {
+            dlclose(resolved->module_handle);
+        }
+        spine_tcm_handle_reset(resolved);
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Returns 1 when the runtime driver is available, otherwise 0. */
+static inline int spine_tcm_is_available(void) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_is_available == NULL) {
+        return 0;
+    }
+
+    return resolved->runtime_is_available();
+}
+
+/* Returns runtime memory geometry and whether the current backend is fake TCM. */
+static inline int spine_tcm_mem_info(spine_tcm_mem_info_t * info) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_layout_info == NULL) {
+        return -1;
+    }
+
+    return resolved->runtime_layout_info(info);
+}
+
+static inline const char * spine_tcm_version(void) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_version == NULL) {
+        return "unknown";
+    }
+
+    return resolved->runtime_version();
+}
+
+/* Returns per-block runtime metadata for the given TCM id. */
+static inline int spine_tcm_block_info(int id, spine_tcm_block_info_t * info) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_info == NULL) {
+        return -1;
+    }
+
+    return resolved->runtime_mem_info(id, info);
+}
+
+/* Returns a cached buffer for the given TCM id, or NULL on failure. */
+static inline void * spine_tcm_mem_get(int id) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        return NULL;
+    }
+
+    if (resolved->runtime_mem_get == NULL) {
+        return NULL;
+    }
+
+    return resolved->runtime_mem_get(id);
+}
+
+/* Releases one reference acquired by spine_tcm_mem_get(id). */
+static inline int spine_tcm_mem_free(int id) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_free == NULL) {
+        return -1;
+    }
+
+    return resolved->runtime_mem_free(id);
+}
+
+/* Waits for a TCM block handoff and returns the driver-owned buffer when available. */
+static inline void * spine_tcm_mem_try_wait(int id, size_t over_time) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        return NULL;
+    }
+
+    if (resolved->runtime_mem_try_wait == NULL) {
+        return NULL;
+    }
+
+    return resolved->runtime_mem_try_wait(id, over_time);
+}
+
+/* Releases a buffer acquired by spine_tcm_mem_try_wait(id, over_time). */
+static inline int spine_tcm_mem_release(int id) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_release == NULL) {
+        return -1;
+    }
+
+    return resolved->runtime_mem_release(id);
+}
+
+/* Forces a release for the given TCM id when the backend supports it. */
+static inline int spine_tcm_mem_force_release(int id) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) ||
+        resolved->runtime_mem_force_release == NULL) {
+        return -1;
+    }
+
+    return resolved->runtime_mem_force_release(id);
+}
+
+/* Returns whether the given TCM id is currently acquired. */
+static inline int spine_tcm_mem_query(int id) {
+    spine_tcm_handle_t * resolved = spine_tcm_default_handle();
+
+    if (resolved->module_handle == NULL && !resolved->use_global_scope) {
+        (void) spine_tcm_open_handle(NULL);
+    }
+
+    if ((resolved->module_handle == NULL && !resolved->use_global_scope) || resolved->runtime_mem_query == NULL) {
+        return -1;
+    }
+
+    return resolved->runtime_mem_query(id);
+}
+#else
+static inline const char * spine_tcm_version(void) {
+    return spine_tcm_runtime_version();
+}
+#endif
+
+#define SPINE_TCM_VERSION (spine_tcm_version())
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif