forked from wylab/llama.cpp
Compare commits
19 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 482211438d | |||
| 7bed317f53 | |||
| dcb7d17758 | |||
| 51604435e8 | |||
| 17158965ac | |||
| 12280ae905 | |||
| 54a0fee4b7 | |||
| dada4c846d | |||
| b8ee22cfde | |||
| 2eaa2c65cb | |||
| c33a58bced | |||
| a81a569577 | |||
| 53ecd4fdb9 | |||
| c6f6e4f96a | |||
| d9f8f60618 | |||
| e4ae383317 | |||
| 34ce48d97a | |||
| 45e350e3d3 | |||
| c6b2c9310c |
@@ -1770,7 +1770,7 @@ jobs:
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
ubuntu-cmake-sanitizer-riscv64-native:
|
||||
runs-on: RISCV64
|
||||
|
||||
+2
-2
@@ -1856,7 +1856,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj"}, "FILE",
|
||||
{"-mm", "--mmproj"}, "FILE",
|
||||
"path to a multimodal projector file. see tools/mtmd/README.md\n"
|
||||
"note: if -hf is used, this argument can be omitted",
|
||||
[](common_params & params, const std::string & value) {
|
||||
@@ -1864,7 +1864,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj-url"}, "URL",
|
||||
{"-mmu", "--mmproj-url"}, "URL",
|
||||
"URL to a multimodal projector file. see tools/mtmd/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.url = value;
|
||||
|
||||
+69
-25
@@ -12,6 +12,8 @@
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <future>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
@@ -472,36 +474,79 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
|
||||
|
||||
#elif defined(LLAMA_USE_HTTPLIB)
|
||||
|
||||
static bool is_output_a_tty() {
|
||||
class ProgressBar {
|
||||
static inline std::mutex mutex;
|
||||
static inline std::map<const ProgressBar *, int> lines;
|
||||
static inline int max_line = 0;
|
||||
|
||||
static void cleanup(const ProgressBar * line) {
|
||||
lines.erase(line);
|
||||
if (lines.empty()) {
|
||||
max_line = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_output_a_tty() {
|
||||
#if defined(_WIN32)
|
||||
return _isatty(_fileno(stdout));
|
||||
return _isatty(_fileno(stdout));
|
||||
#else
|
||||
return isatty(1);
|
||||
return isatty(1);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void print_progress(size_t current, size_t total) {
|
||||
if (!is_output_a_tty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!total) {
|
||||
return;
|
||||
public:
|
||||
ProgressBar() = default;
|
||||
|
||||
~ProgressBar() {
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
cleanup(this);
|
||||
}
|
||||
|
||||
size_t width = 50;
|
||||
size_t pct = (100 * current) / total;
|
||||
size_t pos = (width * current) / total;
|
||||
void update(size_t current, size_t total) {
|
||||
if (!is_output_a_tty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << "["
|
||||
<< std::string(pos, '=')
|
||||
<< (pos < width ? ">" : "")
|
||||
<< std::string(width - pos, ' ')
|
||||
<< "] " << std::setw(3) << pct << "% ("
|
||||
<< current / (1024 * 1024) << " MB / "
|
||||
<< total / (1024 * 1024) << " MB)\r";
|
||||
std::cout.flush();
|
||||
}
|
||||
if (!total) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
if (lines.find(this) == lines.end()) {
|
||||
lines[this] = max_line++;
|
||||
std::cout << "\n";
|
||||
}
|
||||
int lines_up = max_line - lines[this];
|
||||
|
||||
size_t width = 50;
|
||||
size_t pct = (100 * current) / total;
|
||||
size_t pos = (width * current) / total;
|
||||
|
||||
std::cout << "\033[s";
|
||||
|
||||
if (lines_up > 0) {
|
||||
std::cout << "\033[" << lines_up << "A";
|
||||
}
|
||||
std::cout << "\033[2K\r["
|
||||
<< std::string(pos, '=')
|
||||
<< (pos < width ? ">" : "")
|
||||
<< std::string(width - pos, ' ')
|
||||
<< "] " << std::setw(3) << pct << "% ("
|
||||
<< current / (1024 * 1024) << " MB / "
|
||||
<< total / (1024 * 1024) << " MB) "
|
||||
<< "\033[u";
|
||||
|
||||
std::cout.flush();
|
||||
|
||||
if (current == total) {
|
||||
cleanup(this);
|
||||
}
|
||||
}
|
||||
|
||||
ProgressBar(const ProgressBar &) = delete;
|
||||
ProgressBar & operator=(const ProgressBar &) = delete;
|
||||
};
|
||||
|
||||
static bool common_pull_file(httplib::Client & cli,
|
||||
const std::string & resolve_path,
|
||||
@@ -523,6 +568,7 @@ static bool common_pull_file(httplib::Client & cli,
|
||||
const char * func = __func__; // avoid __func__ inside a lambda
|
||||
size_t downloaded = existing_size;
|
||||
size_t progress_step = 0;
|
||||
ProgressBar bar;
|
||||
|
||||
auto res = cli.Get(resolve_path, headers,
|
||||
[&](const httplib::Response &response) {
|
||||
@@ -554,7 +600,7 @@ static bool common_pull_file(httplib::Client & cli,
|
||||
progress_step += len;
|
||||
|
||||
if (progress_step >= total_size / 1000 || downloaded == total_size) {
|
||||
print_progress(downloaded, total_size);
|
||||
bar.update(downloaded, total_size);
|
||||
progress_step = 0;
|
||||
}
|
||||
return true;
|
||||
@@ -562,8 +608,6 @@ static bool common_pull_file(httplib::Client & cli,
|
||||
nullptr
|
||||
);
|
||||
|
||||
std::cout << "\n";
|
||||
|
||||
if (!res) {
|
||||
LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
|
||||
return false;
|
||||
|
||||
@@ -7286,6 +7286,10 @@ class DeepseekV2Model(TextModel):
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
# [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
||||
# note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
|
||||
# ref https://github.com/ggml-org/llama.cpp/pull/17945
|
||||
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
@@ -10041,6 +10045,10 @@ class MistralMoeModel(DeepseekV2Model):
|
||||
MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
|
||||
yarn_params = self.hparams["yarn"]
|
||||
self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
|
||||
|
||||
# [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
||||
# note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
|
||||
# ref https://github.com/ggml-org/llama.cpp/pull/17945
|
||||
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
|
||||
+3
-3
@@ -56,7 +56,7 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /model
|
||||
or with a server image:
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
|
||||
docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512
|
||||
```
|
||||
|
||||
## Docker With CUDA
|
||||
@@ -91,7 +91,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
|
||||
```bash
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
```
|
||||
|
||||
## Docker With MUSA
|
||||
@@ -125,5 +125,5 @@ After building locally, Usage is similar to the non-MUSA examples, but you'll ne
|
||||
```bash
|
||||
docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
```
|
||||
|
||||
@@ -32,10 +32,6 @@ def quick_logits_check(pytorch_file, llamacpp_file):
|
||||
print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
|
||||
print(f"Max absolute difference: {max_diff:.4f}")
|
||||
|
||||
if max_diff > 1.0:
|
||||
print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def main():
|
||||
|
||||
@@ -99,6 +99,7 @@ extern "C" {
|
||||
GGML_BACKEND_API int ggml_cpu_has_sme (void);
|
||||
// other
|
||||
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
|
||||
GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes
|
||||
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
|
||||
|
||||
+15
-13
@@ -312,16 +312,9 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
|
||||
}
|
||||
|
||||
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
||||
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
|
||||
static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
|
||||
AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
|
||||
__func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
remove_allocated_tensor(alloc, addr, tensor);
|
||||
#endif
|
||||
|
||||
struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
|
||||
|
||||
// see if we can merge with an existing block
|
||||
@@ -357,8 +350,6 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
|
||||
}
|
||||
// otherwise, add a new block
|
||||
ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
|
||||
|
||||
GGML_UNUSED(tensor);
|
||||
}
|
||||
|
||||
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
|
||||
@@ -616,13 +607,17 @@ static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_ten
|
||||
|
||||
GGML_ASSERT(parent_size >= node_size);
|
||||
|
||||
// note: we want after the freeing the chunks to continue to be aligned
|
||||
struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
|
||||
parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment);
|
||||
node_size = aligned_offset(NULL, node_size, p_alloc->alignment);
|
||||
|
||||
if (parent_size > node_size) {
|
||||
struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
|
||||
struct buffer_address p_addr = p_hn->addr;
|
||||
p_addr.offset += node_size;
|
||||
size_t extra_size = parent_size - node_size;
|
||||
AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
|
||||
ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
|
||||
ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -706,7 +701,14 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
|
||||
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
||||
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
||||
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
|
||||
ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
|
||||
|
||||
AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
|
||||
__func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks);
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
remove_allocated_tensor(alloc, hn->addr, node);
|
||||
#endif
|
||||
|
||||
ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size);
|
||||
hn->allocated = false;
|
||||
}
|
||||
|
||||
|
||||
@@ -2548,6 +2548,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
return true;
|
||||
case GGML_OP_PAD:
|
||||
// TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
|
||||
return ggml_get_op_params_i32(op, 8) == 0;
|
||||
|
||||
@@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
|
||||
} ggml_arm_arch_features = { 0 };
|
||||
#endif
|
||||
|
||||
#if defined(__riscv)
|
||||
struct ggml_riscv_arch_features_type {
|
||||
int rvv_vlen;
|
||||
} ggml_riscv_arch_features = { 0 };
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
||||
@@ -703,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
|
||||
#endif
|
||||
#endif // __ARM_ARCH
|
||||
|
||||
#if defined(__riscv) && defined(__riscv_v_intrinsic)
|
||||
#include <riscv_vector.h>
|
||||
static void ggml_init_riscv_arch_features(void) {
|
||||
ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
|
||||
}
|
||||
#else
|
||||
static void ggml_init_riscv_arch_features(void) {}
|
||||
#endif
|
||||
|
||||
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
|
||||
GGML_ASSERT(!ggml_get_no_alloc(ctx));
|
||||
|
||||
@@ -3459,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_get_rvv_vlen(void) {
|
||||
#if defined(__riscv) && defined(__riscv_v_intrinsic)
|
||||
return ggml_riscv_arch_features.rvv_vlen;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_f16c(void) {
|
||||
#if defined(__F16C__)
|
||||
return 1;
|
||||
@@ -3625,6 +3647,10 @@ void ggml_cpu_init(void) {
|
||||
ggml_init_arm_arch_features();
|
||||
#endif
|
||||
|
||||
#if defined(__riscv)
|
||||
ggml_init_riscv_arch_features();
|
||||
#endif
|
||||
|
||||
is_first_call = false;
|
||||
}
|
||||
|
||||
|
||||
@@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
||||
if (ggml_cpu_has_riscv_v()) {
|
||||
features.push_back({ "RISCV_V", "1" });
|
||||
}
|
||||
if (ggml_cpu_get_rvv_vlen() > 0) {
|
||||
static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
|
||||
features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
|
||||
}
|
||||
if (ggml_cpu_has_vsx()) {
|
||||
features.push_back({ "VSX", "1" });
|
||||
}
|
||||
|
||||
@@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
||||
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
|
||||
|
||||
if (cur->type == GGML_TYPE_Q4_0) {
|
||||
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
|
||||
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
|
||||
|| (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
|
||||
if (cur->ne[1] % 8 == 0) {
|
||||
return &q4_0_8x8_q8_0;
|
||||
}
|
||||
|
||||
@@ -67,19 +67,22 @@
|
||||
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
|
||||
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
|
||||
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
|
||||
#define GGML_CUDA_CC_RDNA3_5 (GGML_CUDA_CC_OFFSET_AMD + 0x1150) // AI 370, AI Max 395 laptops.
|
||||
#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
|
||||
|
||||
#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
|
||||
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
|
||||
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
|
||||
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
|
||||
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
|
||||
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
|
||||
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
|
||||
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
|
||||
#define GGML_CUDA_CC_IS_RDNA3_0(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA3_5)
|
||||
#define GGML_CUDA_CC_IS_RDNA3_5(cc) (cc >= GGML_CUDA_CC_RDNA3_5 && cc < GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_RDNA3(cc) (GGML_CUDA_CC_IS_RDNA3_0(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc))
|
||||
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
|
||||
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
|
||||
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
|
||||
|
||||
// Moore Threads
|
||||
#define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons
|
||||
|
||||
@@ -642,8 +642,8 @@ static __global__ void flash_attn_stream_k_fixup(
|
||||
const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
|
||||
const int iter_j = (ne01 + (ncols1 - 1)) / ncols1;
|
||||
|
||||
const int kbc0 = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
|
||||
const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
|
||||
const int kbc0 = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
|
||||
const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
|
||||
|
||||
const bool did_not_have_any_data = kbc0 == kbc0_stop;
|
||||
const bool wrote_beginning_of_tile = kbc0 % iter_k == 0;
|
||||
@@ -679,7 +679,7 @@ static __global__ void flash_attn_stream_k_fixup(
|
||||
int bidx = bidx0 - 1;
|
||||
int kbc_stop = kbc0;
|
||||
while(true) {
|
||||
const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
|
||||
const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
|
||||
if (kbc == kbc_stop) { // Did not have any data.
|
||||
bidx--;
|
||||
kbc_stop = kbc;
|
||||
|
||||
@@ -1380,8 +1380,8 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1;
|
||||
|
||||
// kbc == k block continuous, current index in continuous ijk space.
|
||||
int kbc = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
|
||||
const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
|
||||
int kbc = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
|
||||
const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
|
||||
|
||||
// If the seams of 2 CUDA blocks fall within an output tile their results need to be combined.
|
||||
// For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup).
|
||||
@@ -1401,7 +1401,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
|
||||
const half * mask_h = ncols2 == 1 && !mask ? nullptr :
|
||||
(const half *) (mask + nb33*(sequence % ne33));
|
||||
(const half *) (mask + nb33*(sequence % ne33));
|
||||
float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2);
|
||||
|
||||
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
|
||||
|
||||
@@ -4630,9 +4630,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_CUMSUM:
|
||||
case GGML_OP_TRI:
|
||||
case GGML_OP_DIAG:
|
||||
return true;
|
||||
case GGML_OP_SOLVE_TRI:
|
||||
return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;
|
||||
return true;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -189,6 +189,9 @@ namespace ggml_cuda_mma {
|
||||
return 8 * (threadIdx.x / 16) + l;
|
||||
#elif defined(RDNA3)
|
||||
return 2 * l + (threadIdx.x / 16);
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
#endif // defined(RDNA4)
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
@@ -290,8 +293,12 @@ namespace ggml_cuda_mma {
|
||||
}
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
|
||||
#if defined(RDNA3)
|
||||
// RDNA3 has duplicated data as input.
|
||||
static constexpr int ne = I * J / 32 * 2;
|
||||
#else
|
||||
static constexpr int ne = I * J / 32;
|
||||
#endif // defined(RDNA3)
|
||||
half2 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
@@ -310,7 +317,14 @@ namespace ggml_cuda_mma {
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 16 && J == 8) {
|
||||
#if defined(RDNA4)
|
||||
return 4 * (threadIdx.x / 16) + l;
|
||||
#elif defined(RDNA3)
|
||||
return l;
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
#endif // defined(RDNA4)
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
@@ -366,11 +380,16 @@ namespace ggml_cuda_mma {
|
||||
static constexpr int I = I_;
|
||||
static constexpr int J = J_;
|
||||
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
|
||||
static constexpr int ne = I * J / WARP_SIZE;
|
||||
|
||||
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
#if defined(AMD_WMMA_AVAILABLE)
|
||||
#if defined(RDNA3)
|
||||
// RDNA3 has duplicated data as input.
|
||||
static constexpr int ne = I * J / 32 * 2;
|
||||
#else
|
||||
static constexpr int ne = I * J / 32;
|
||||
#endif // defined(RDNA3)
|
||||
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
if (I == 16 && J == 8) return true;
|
||||
return false;
|
||||
@@ -387,13 +406,23 @@ namespace ggml_cuda_mma {
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 16 && J == 8) {
|
||||
#if defined(RDNA4)
|
||||
return 4 * (threadIdx.x / 16) + l;
|
||||
#elif defined(RDNA3)
|
||||
return l;
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
#endif // defined(RDNA4)
|
||||
} else {
|
||||
NO_DEVICE_CODE;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr int ne = I * J / WARP_SIZE;
|
||||
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
|
||||
|
||||
static constexpr __device__ bool supported() {
|
||||
if (I == 8 && J == 8) return true;
|
||||
if (I == 16 && J == 4) return true;
|
||||
@@ -546,8 +575,14 @@ namespace ggml_cuda_mma {
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
|
||||
#if defined(RDNA4)
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
#elif defined(RDNA3)
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2));
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(RDNA4)
|
||||
} else if constexpr (std::is_same_v<T, int>) {
|
||||
if constexpr (I == 16 && J == 4) {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
@@ -888,6 +923,16 @@ namespace ggml_cuda_mma {
|
||||
const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
|
||||
const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
|
||||
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
|
||||
#elif defined(RDNA3)
|
||||
using halfx16_t = __attribute__((ext_vector_type(16))) _Float16;
|
||||
using floatx8_t = __attribute__((ext_vector_type(8))) float;
|
||||
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
|
||||
const halfx16_t& a_frag = reinterpret_cast<const halfx16_t&>(A.x[0]);
|
||||
const halfx16_t& b_frag = reinterpret_cast<const halfx16_t&>(B.x[0]);
|
||||
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, acc_frag);
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // RDNA4
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
@@ -905,6 +950,16 @@ namespace ggml_cuda_mma {
|
||||
const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
|
||||
const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
|
||||
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
|
||||
#elif defined(RDNA3)
|
||||
using bf16x16_t = __attribute__((ext_vector_type(16))) __bf16;
|
||||
using floatx8_t = __attribute__((ext_vector_type(8))) float;
|
||||
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
|
||||
const bf16x16_t& a_frag = reinterpret_cast<const bf16x16_t&>(A.x[0]);
|
||||
const bf16x16_t& b_frag = reinterpret_cast<const bf16x16_t&>(B.x[0]);
|
||||
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_frag, b_frag, acc_frag);
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // RDNA4
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B);
|
||||
|
||||
@@ -151,7 +151,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (src1_ncols > 16) {
|
||||
if (GGML_CUDA_CC_IS_RDNA3_0(cc) && src1_ncols > 8) {
|
||||
return false;
|
||||
} else if (src1_ncols > 16) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -160,9 +162,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
|
||||
case GGML_TYPE_F32:
|
||||
return ampere_mma_available(cc);
|
||||
case GGML_TYPE_F16:
|
||||
return volta_mma_available(cc) || turing_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
|
||||
return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
|
||||
case GGML_TYPE_BF16:
|
||||
return ampere_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
|
||||
return ampere_mma_available(cc) || amd_wmma_available(cc);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -765,7 +765,10 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
|
||||
return ne11 <= 8;
|
||||
} else if (GGML_CUDA_CC_IS_AMD(cc)) {
|
||||
if (fp16_mma_hardware_available(cc)) {
|
||||
if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||
if (GGML_CUDA_CC_IS_RDNA3(cc)) {
|
||||
return ne11 <= 3;
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||
return ne11 <= 5;
|
||||
}
|
||||
return ne11 <= 2;
|
||||
|
||||
@@ -3,6 +3,80 @@
|
||||
#include "solve_tri.cuh"
|
||||
|
||||
#define MAX_N_FAST 64
|
||||
#define MAX_K_FAST 32
|
||||
|
||||
static __global__ void get_batch_pointers(const float * A,
|
||||
float * X,
|
||||
const float ** A_ptrs,
|
||||
float ** X_ptrs,
|
||||
int64_t ne02,
|
||||
int64_t total_batches,
|
||||
size_t s02,
|
||||
size_t s03,
|
||||
size_t s2,
|
||||
size_t s3) {
|
||||
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (idx >= total_batches) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i3 = idx / ne02;
|
||||
const int64_t i2 = idx % ne02;
|
||||
|
||||
A_ptrs[idx] = A + i3 * s03 + i2 * s02;
|
||||
X_ptrs[idx] = X + i3 * s3 + i2 * s2;
|
||||
}
|
||||
|
||||
static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx,
|
||||
const float * A,
|
||||
const float * B,
|
||||
float * X,
|
||||
int n,
|
||||
int k,
|
||||
int64_t ne02,
|
||||
int64_t ne03,
|
||||
size_t s02,
|
||||
size_t s03,
|
||||
size_t s12,
|
||||
size_t s13,
|
||||
size_t s2,
|
||||
size_t s3,
|
||||
cudaStream_t stream) {
|
||||
const float alpha = 1.0f;
|
||||
const int64_t total_batches = ne02 * ne03;
|
||||
if (total_batches == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Bulk copy B -> X (contiguous tensors)
|
||||
if (X != B) {
|
||||
const int64_t total_elements_BX = n * k * total_batches;
|
||||
CUDA_CHECK(cudaMemcpyAsync(X, B, total_elements_BX * sizeof(float), cudaMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
|
||||
ggml_cuda_pool_alloc<const float *> A_ptrs_alloc(ctx.pool(id), total_batches);
|
||||
ggml_cuda_pool_alloc<float *> X_ptrs_alloc(ctx.pool(id), total_batches);
|
||||
|
||||
const float ** A_ptrs_dev = A_ptrs_alloc.get();
|
||||
float ** X_ptrs_dev = X_ptrs_alloc.get();
|
||||
|
||||
get_batch_pointers<<<(total_batches + 255) / 256, 256, 0, stream>>>(A, X, A_ptrs_dev, X_ptrs_dev, ne02,
|
||||
total_batches, s02, s03, s2, s3);
|
||||
|
||||
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
|
||||
|
||||
// Yes, this is necessary, without this we get RMSE errors
|
||||
CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_DEFAULT_MATH));
|
||||
CUBLAS_CHECK(cublasStrsmBatched(ctx.cublas_handle(id), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
|
||||
CUBLAS_DIAG_NON_UNIT, k, n, &alpha, A_ptrs_dev, n, X_ptrs_dev, k, total_batches));
|
||||
|
||||
// revert to standard mode from common.cuh
|
||||
CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_TF32_TENSOR_OP_MATH));
|
||||
|
||||
GGML_UNUSED_VARS(s12, s13);
|
||||
}
|
||||
|
||||
// ======================
|
||||
// Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction
|
||||
@@ -63,7 +137,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
|
||||
float x_low = (lane < n) ? B_batch[lane * k + col_idx] : 0.0f;
|
||||
float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f;
|
||||
|
||||
const int half = WARP_SIZE;
|
||||
const int half = WARP_SIZE;
|
||||
const int nrows_low = (n < half) ? n : half;
|
||||
|
||||
#pragma unroll
|
||||
@@ -81,8 +155,8 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
|
||||
|
||||
#pragma unroll
|
||||
for (int row = half; row < n; ++row) {
|
||||
float sum = sA[row * n + lane] * x_low;
|
||||
const int j = half + lane;
|
||||
float sum = sA[row * n + lane] * x_low;
|
||||
const int j = half + lane;
|
||||
if (j < row) {
|
||||
sum += sA[row * n + j] * x_high;
|
||||
}
|
||||
@@ -97,7 +171,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
|
||||
for (int rr = 0; rr < 2; ++rr) {
|
||||
const int row = rr * WARP_SIZE + lane;
|
||||
if (row < n) {
|
||||
const float val = (row < half) ? x_low : x_high;
|
||||
const float val = (row < half) ? x_low : x_high;
|
||||
X_batch[row * k + col_idx] = val;
|
||||
}
|
||||
}
|
||||
@@ -176,20 +250,26 @@ static void solve_tri_f32_cuda(const float * A,
|
||||
}
|
||||
|
||||
void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0]; // A (triangular n x x matrix)
|
||||
const ggml_tensor * src1 = dst->src[1]; // B (right hand side of n x k equation columns)
|
||||
const ggml_tensor * src0 = dst->src[0]; // A (n×n, lower triangular)
|
||||
const ggml_tensor * src1 = dst->src[1]; // B (n×k)
|
||||
|
||||
ggml_is_contiguous(src0);
|
||||
ggml_is_contiguous(src1);
|
||||
|
||||
const int64_t n = src0->ne[0];
|
||||
const int64_t k = src1->ne[0];
|
||||
const int64_t n = src0->ne[0];
|
||||
const int64_t k = src1->ne[0];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
|
||||
GGML_ASSERT(n <= 64);
|
||||
GGML_ASSERT(k <= 32);
|
||||
|
||||
solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2],
|
||||
src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
|
||||
src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
|
||||
dst->nb[3] / sizeof(float), ctx.stream());
|
||||
if (n <= MAX_N_FAST && k <= MAX_K_FAST) {
|
||||
solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
|
||||
src0->ne[2], src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
|
||||
src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
|
||||
dst->nb[3] / sizeof(float), ctx.stream());
|
||||
} else {
|
||||
solve_tri_f32_cublas(ctx, (const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k,
|
||||
ne02, ne03, src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
|
||||
src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
|
||||
dst->nb[3] / sizeof(float), ctx.stream());
|
||||
}
|
||||
}
|
||||
|
||||
Vendored
+4
@@ -19,6 +19,9 @@
|
||||
#define CUDA_R_16F HIPBLAS_R_16F
|
||||
#define CUDA_R_16BF HIPBLAS_R_16B
|
||||
#define CUDA_R_32F HIPBLAS_R_32F
|
||||
#define CUBLAS_SIDE_RIGHT HIPBLAS_SIDE_RIGHT
|
||||
#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER
|
||||
#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT
|
||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
|
||||
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
|
||||
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
|
||||
@@ -30,6 +33,7 @@
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
#define __all_sync(mask, var) __all(var)
|
||||
#define __any_sync(mask, var) __any(var)
|
||||
#define cublasStrsmBatched hipblasStrsmBatched
|
||||
#define cublasCreate hipblasCreate
|
||||
#define cublasDestroy hipblasDestroy
|
||||
#define cublasGemmEx hipblasGemmEx
|
||||
|
||||
Vendored
+5
@@ -12,11 +12,16 @@
|
||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_OP_N MUBLAS_OP_N
|
||||
#define CUBLAS_OP_T MUBLAS_OP_T
|
||||
#define CUBLAS_DEFAULT_MATH MUBLAS_DEFAULT_MATH
|
||||
#define CUBLAS_SIDE_RIGHT MUBLAS_SIDE_RIGHT
|
||||
#define CUBLAS_FILL_MODE_UPPER MUBLAS_FILL_MODE_UPPER
|
||||
#define CUBLAS_DIAG_NON_UNIT MUBLAS_DIAG_NON_UNIT
|
||||
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
|
||||
#define CUDA_R_16F MUSA_R_16F
|
||||
#define CUDA_R_16BF MUSA_R_16BF
|
||||
#define CUDA_R_32F MUSA_R_32F
|
||||
#define cublasStrsmBatched mublasStrsmBatched
|
||||
#define cublasComputeType_t cudaDataType_t
|
||||
#define cublasCreate mublasCreate
|
||||
#define cublasDestroy mublasDestroy
|
||||
|
||||
@@ -73,15 +73,15 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
||||
return (1 - MIN(1, MAX(0, y)));
|
||||
}
|
||||
|
||||
static void rope_cache_init(const float theta_base,
|
||||
float freq_scale,
|
||||
const float * freq_factors,
|
||||
float * corr_dims,
|
||||
uint32_t ne0,
|
||||
float ext_factor,
|
||||
float mscale,
|
||||
float * cache,
|
||||
float theta_scale) {
|
||||
static void rope_cache_init(const float theta_base,
|
||||
const float freq_scale,
|
||||
const float * freq_factors,
|
||||
float * corr_dims,
|
||||
const uint32_t ne0,
|
||||
const float ext_factor,
|
||||
const float mscale,
|
||||
float * cache,
|
||||
const float theta_scale) {
|
||||
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
|
||||
float theta = theta_base;
|
||||
|
||||
@@ -92,18 +92,19 @@ static void rope_cache_init(const float theta_base,
|
||||
|
||||
// Get n-d rotational scaling corrected for extrapolation
|
||||
float theta_interp = freq_scale * theta_extrap;
|
||||
float theta2 = theta_interp;
|
||||
float theta_final = theta_interp;
|
||||
float mscale_final = mscale;
|
||||
|
||||
if (ext_factor != 0.0f) {
|
||||
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
|
||||
theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
|
||||
// Get n-d magnitude scaling corrected for interpolation
|
||||
mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
|
||||
mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale);
|
||||
}
|
||||
|
||||
cache[i0 + 0] = cosf(theta2) * mscale;
|
||||
cache[i0 + 1] = sinf(theta2) * mscale;
|
||||
cache[i0 + 0] = cosf(theta_final) * mscale_final;
|
||||
cache[i0 + 1] = sinf(theta_final) * mscale_final;
|
||||
|
||||
theta *= theta_scale;
|
||||
}
|
||||
@@ -151,9 +152,9 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context
|
||||
}
|
||||
|
||||
static void hvx_calc_rope_neox_f32(const float * restrict src0,
|
||||
float * restrict dst,
|
||||
const int num_elems,
|
||||
const float * restrict theta_cache) {
|
||||
float * restrict dst,
|
||||
const int num_elems,
|
||||
const float * restrict theta_cache) {
|
||||
// for (int i = 0; i < num_elems; i += 2) {
|
||||
//const float cos_theta = theta_cache[i + 0];
|
||||
//const float sin_theta = theta_cache[i + 1];
|
||||
@@ -192,7 +193,7 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0,
|
||||
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
|
||||
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
|
||||
|
||||
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
|
||||
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
|
||||
*(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5);
|
||||
|
||||
src0_curr += VLEN;
|
||||
@@ -259,7 +260,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
|
||||
const uint32_t ir1,
|
||||
int nth,
|
||||
int ith,
|
||||
int opt_path) {
|
||||
const int opt_path) {
|
||||
struct htp_ops_context * octx = rope_ctx->octx;
|
||||
|
||||
const struct htp_tensor * src0 = &octx->src0;
|
||||
@@ -267,8 +268,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
|
||||
const struct htp_tensor * src2 = &octx->src2;
|
||||
struct htp_tensor * dst = &octx->dst;
|
||||
|
||||
const int32_t mode = rope_ctx->mode;
|
||||
const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;
|
||||
const int32_t mode = rope_ctx->mode;
|
||||
const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;
|
||||
|
||||
htp_rope_preamble;
|
||||
|
||||
@@ -281,8 +282,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
|
||||
freq_factors = (const float *) src2->data;
|
||||
}
|
||||
|
||||
int ir = 0;
|
||||
|
||||
const uint32_t i1_end = MIN(ir1, ne1);
|
||||
const int32_t half_dims = rope_ctx->n_dims / 2;
|
||||
const size_t remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float);
|
||||
for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch
|
||||
for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len
|
||||
const int32_t p = pos[i2];
|
||||
@@ -290,14 +292,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
|
||||
rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor,
|
||||
rope_ctx->attn_factor, wp0, rope_ctx->theta_scale);
|
||||
|
||||
for (uint32_t i1 = 0; i1 < ne1; i1++) { // attn-heads
|
||||
if (ir++ < ir0) {
|
||||
continue;
|
||||
}
|
||||
if (ir > ir1) {
|
||||
break;
|
||||
}
|
||||
|
||||
for (uint32_t i1 = ir0; i1 < i1_end; i1++) { // attn-heads
|
||||
const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01);
|
||||
float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1);
|
||||
|
||||
@@ -310,6 +305,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
|
||||
} else {
|
||||
hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
|
||||
}
|
||||
|
||||
src_loc += rope_ctx->n_dims;
|
||||
dst_data_loc += rope_ctx->n_dims;
|
||||
} else {
|
||||
for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
|
||||
const float cos_theta = wp0[i0 + 0];
|
||||
@@ -317,10 +315,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
|
||||
|
||||
if (is_neox) {
|
||||
const float x0 = src_loc[0];
|
||||
const float x1 = src_loc[rope_ctx->n_dims/2];
|
||||
const float x1 = src_loc[half_dims];
|
||||
|
||||
dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
|
||||
dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta;
|
||||
dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
|
||||
dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta;
|
||||
|
||||
src_loc += 1;
|
||||
dst_data_loc += 1;
|
||||
@@ -335,15 +333,13 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
|
||||
dst_data_loc += 2;
|
||||
}
|
||||
}
|
||||
|
||||
src_loc += (is_neox ? half_dims : 0);
|
||||
dst_data_loc += (is_neox ? half_dims : 0);
|
||||
}
|
||||
|
||||
for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) {
|
||||
dst_data_loc[0] = src_loc[0];
|
||||
dst_data_loc[1] = src_loc[1];
|
||||
|
||||
src_loc += 2;
|
||||
dst_data_loc += 2;
|
||||
}
|
||||
// TODO: use simd to speed up the remaining elements copy
|
||||
memcpy(dst_data_loc, src_loc, remain_bytes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+12
-2
@@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
|
||||
udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
|
||||
udata->output .resize(n_tokens);
|
||||
|
||||
udata->seq_id_data.reserve(n_tokens);
|
||||
|
||||
seq_set_t seq_set_unq;
|
||||
|
||||
for (size_t i = 0; i < idxs.size(); ++i) {
|
||||
@@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
|
||||
}
|
||||
|
||||
udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
|
||||
udata->seq_id[i] = batch.seq_id[idxs[i]];
|
||||
udata->output[i] = batch.logits[idxs[i]];
|
||||
|
||||
for (int s = 0; s < udata->n_seq_id[i]; ++s) {
|
||||
seq_set_unq.set(udata->seq_id[i][s]);
|
||||
const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
|
||||
|
||||
udata->seq_id_data.push_back(seq_id);
|
||||
seq_set_unq.set(seq_id);
|
||||
}
|
||||
|
||||
if (udata->output[i]) {
|
||||
@@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
|
||||
}
|
||||
}
|
||||
|
||||
llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
|
||||
for (size_t i = 0; i < idxs.size(); ++i) {
|
||||
udata->seq_id[i] = seq_id_ptr;
|
||||
seq_id_ptr += udata->n_seq_id[i];
|
||||
}
|
||||
|
||||
for (uint32_t s = 0; s < n_seq_max; ++s) {
|
||||
if (seq_set_unq.test(s)) {
|
||||
udata->seq_idx[s] = udata->seq_id_unq.size();
|
||||
|
||||
+4
-2
@@ -56,13 +56,15 @@ struct llama_ubatch {
|
||||
std::vector<float> embd;
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id *> seq_id;
|
||||
std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
|
||||
std::vector<llama_seq_id> seq_id_unq;
|
||||
std::vector<int32_t> seq_idx;
|
||||
std::vector<int8_t> output;
|
||||
|
||||
std::vector<llama_seq_id> seq_id_data;
|
||||
};
|
||||
|
||||
// the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
|
||||
// the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
|
||||
std::shared_ptr<data_t> data;
|
||||
};
|
||||
|
||||
|
||||
+1
-1
@@ -574,7 +574,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
||||
freq_base (cparams.rope_freq_base),
|
||||
freq_scale (cparams.rope_freq_scale),
|
||||
ext_factor (cparams.yarn_ext_factor),
|
||||
attn_factor (cparams.yarn_attn_factor),
|
||||
attn_factor (llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor)),
|
||||
beta_fast (cparams.yarn_beta_fast),
|
||||
beta_slow (cparams.yarn_beta_slow),
|
||||
norm_eps (hparams.f_norm_eps),
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
#include "llama-hparams.h"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
||||
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
|
||||
if (dense_first) {
|
||||
@@ -229,3 +231,13 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) {
|
||||
GGML_ASSERT(ext_factor >= 0.0f);
|
||||
|
||||
if (ext_factor != 0.0f) {
|
||||
attn_factor *= 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
|
||||
}
|
||||
|
||||
return attn_factor;
|
||||
}
|
||||
|
||||
+8
-1
@@ -107,6 +107,7 @@ struct llama_hparams {
|
||||
float rope_freq_base_train_swa;
|
||||
float rope_freq_scale_train;
|
||||
float rope_freq_scale_train_swa;
|
||||
|
||||
uint32_t n_ctx_orig_yarn;
|
||||
float rope_yarn_log_mul = 0.0f;
|
||||
|
||||
@@ -267,7 +268,13 @@ struct llama_hparams {
|
||||
// TODO: think of a better place for this function
|
||||
// TODO: pack the SWA params in a struct?
|
||||
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
|
||||
|
||||
// when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
|
||||
// https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
|
||||
//
|
||||
// ref: https://github.com/ggml-org/llama.cpp/discussions/7416
|
||||
// https://github.com/ggml-org/llama.cpp/pull/17945
|
||||
static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor);
|
||||
};
|
||||
|
||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||
|
||||
|
||||
@@ -1369,9 +1369,10 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
||||
float freq_scale) const {
|
||||
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
|
||||
|
||||
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
|
||||
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
|
||||
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
||||
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
|
||||
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
|
||||
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
||||
const auto & yarn_attn_factor = llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor);
|
||||
|
||||
const auto & n_rot = hparams.n_rot;
|
||||
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
|
||||
@@ -1382,12 +1383,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
||||
? LLAMA_ROPE_TYPE_NEOX
|
||||
: hparams.rope_type;
|
||||
|
||||
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
|
||||
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
||||
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
|
||||
? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
|
||||
: cparams.yarn_attn_factor;
|
||||
|
||||
ggml_tensor * tmp;
|
||||
|
||||
if (ggml_is_quantized(cur->type)) {
|
||||
|
||||
+36
-17
@@ -1635,7 +1635,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
// that have no expert_gating_func model parameter set
|
||||
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
|
||||
}
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
|
||||
|
||||
if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
|
||||
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
||||
// cancel the factor from the convert script
|
||||
hparams.rope_yarn_log_mul /= 0.1f;
|
||||
}
|
||||
|
||||
// (optional) temperature tuning - used by mistral-large
|
||||
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
|
||||
@@ -2267,9 +2272,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
|
||||
|
||||
// TODO: maybe add n_attn_temp_floor_scale as a separate KV?
|
||||
if (hparams.f_attn_temp_scale != 0.0f) {
|
||||
@@ -2279,18 +2284,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
|
||||
// but may need further verification with other values
|
||||
if (hparams.rope_yarn_log_mul != 0.0f) {
|
||||
float factor = 1.0f / hparams.rope_freq_scale_train;
|
||||
float mscale = 1.0f;
|
||||
float mscale_all_dims = hparams.rope_yarn_log_mul;
|
||||
static auto get_mscale = [](float scale, float mscale) {
|
||||
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
|
||||
};
|
||||
hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 26: type = LLM_TYPE_3B; break;
|
||||
case 34: type = LLM_TYPE_8B; break;
|
||||
@@ -2301,6 +2294,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: throw std::runtime_error("unsupported model architecture");
|
||||
}
|
||||
|
||||
// ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
|
||||
if (hparams.rope_yarn_log_mul != 0.0f) {
|
||||
const float factor = 1.0f / hparams.rope_freq_scale_train;
|
||||
|
||||
// note: here we assume `mscale == 1.0f`
|
||||
// TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
|
||||
float mscale = 1.0f;
|
||||
const float mscale_all_dims = hparams.rope_yarn_log_mul;
|
||||
|
||||
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
||||
// special-case DEEPSEEK v2:
|
||||
// https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
|
||||
if (arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
|
||||
mscale = mscale_all_dims;
|
||||
}
|
||||
|
||||
static auto get_mscale = [](float scale, float mscale) {
|
||||
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
|
||||
};
|
||||
|
||||
hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
|
||||
|
||||
LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
|
||||
__func__, hparams.yarn_attn_factor, mscale, mscale_all_dims);
|
||||
}
|
||||
|
||||
pimpl->n_bytes = ml.n_bytes;
|
||||
|
||||
pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
|
||||
@@ -6806,6 +6825,7 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
||||
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
||||
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
||||
LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
||||
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
|
||||
// MRoPE (Multi-axis Rotary Position Embedding) sections
|
||||
if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
|
||||
@@ -6869,7 +6889,6 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
||||
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
||||
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_QWEN2MOE) {
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
#include "models.h"
|
||||
|
||||
|
||||
|
||||
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
|
||||
@@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
||||
|
||||
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
|
||||
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
||||
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
|
||||
const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
|
||||
const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
|
||||
// And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
||||
|
||||
// first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
|
||||
GGML_ASSERT(ext_factor >= 0.0f);
|
||||
const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
|
||||
|
||||
// use the original attn_factor to pre-scale the kq_scale
|
||||
const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
|
||||
const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
@@ -7861,9 +7861,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 }));
|
||||
|
||||
for (bool v : {false, true}) {
|
||||
for (bool circular : {false, true}) {
|
||||
@@ -8064,12 +8079,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416));
|
||||
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
|
||||
// qwen3next with CHUNK_SIZE 64
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 }));
|
||||
// qwen3next with CHUNK_SIZE 128
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 }));
|
||||
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 }));
|
||||
|
||||
test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 }));
|
||||
test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));
|
||||
|
||||
@@ -53,6 +53,15 @@ if (TARGET BUILD_INFO)
|
||||
add_dependencies(mtmd-helper BUILD_INFO)
|
||||
endif()
|
||||
|
||||
# if mtmd is linked against common, we throw an error
|
||||
if (TARGET mtmd)
|
||||
get_target_property(libs mtmd LINK_LIBRARIES)
|
||||
if (libs AND "common" IN_LIST libs)
|
||||
message(FATAL_ERROR "mtmd is designed to be a public library.\n"
|
||||
"It must not link against common")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
add_executable(llama-llava-cli deprecation-warning.cpp)
|
||||
add_executable(llama-gemma3-cli deprecation-warning.cpp)
|
||||
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
|
||||
|
||||
@@ -13,6 +13,8 @@
|
||||
|
||||
// Internal header for clip.cpp
|
||||
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
#define KEY_FTYPE "general.file_type"
|
||||
#define KEY_NAME "general.name"
|
||||
#define KEY_DESCRIPTION "general.description"
|
||||
|
||||
+91
-61
@@ -595,11 +595,12 @@ struct clip_graph {
|
||||
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
|
||||
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_1_b);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_2_b);
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
|
||||
} else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
|
||||
cur = build_ffn(cur,
|
||||
@@ -667,16 +668,12 @@ struct clip_graph {
|
||||
|
||||
// LlavaMultiModalProjector (always using GELU activation)
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
if (model.mm_1_b) {
|
||||
cur = ggml_add(ctx0, cur, model.mm_1_b);
|
||||
}
|
||||
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
if (model.mm_2_b) {
|
||||
cur = ggml_add(ctx0, cur, model.mm_2_b);
|
||||
}
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
}
|
||||
|
||||
// arrangement of the [IMG_BREAK] token
|
||||
@@ -866,16 +863,12 @@ struct clip_graph {
|
||||
// multimodal projection
|
||||
ggml_tensor * embeddings = inpL;
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
|
||||
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
|
||||
// GELU activation
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
// Second linear layer
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
|
||||
embeddings = build_ffn(embeddings,
|
||||
model.mm_0_w, model.mm_0_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
|
||||
if (use_window_attn) {
|
||||
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
|
||||
@@ -1253,11 +1246,12 @@ struct clip_graph {
|
||||
// projector LayerNorm uses pytorch's default eps = 1e-5
|
||||
// ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
|
||||
cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_1_b);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_3_b);
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_3_w, model.mm_3_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
@@ -1408,11 +1402,12 @@ struct clip_graph {
|
||||
cb(cur, "proj_inp_normed", -1);
|
||||
|
||||
// projection mlp
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_1_b);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_2_b);
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
cb(cur, "proj_out", -1);
|
||||
}
|
||||
|
||||
@@ -1883,9 +1878,12 @@ struct clip_graph {
|
||||
|
||||
} else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
|
||||
// projector
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = ggml_gelu_erf(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
cur = build_ffn(cur,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
FFN_GELU_ERF,
|
||||
-1);
|
||||
|
||||
} else {
|
||||
GGML_ABORT("%s: unknown projector type", __func__);
|
||||
@@ -2070,34 +2068,66 @@ private:
|
||||
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||
if (layer.q_b) {
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
|
||||
}
|
||||
ggml_tensor * Qcur = nullptr;
|
||||
ggml_tensor * Kcur = nullptr;
|
||||
ggml_tensor * Vcur = nullptr;
|
||||
if (layer.qkv_w != nullptr) {
|
||||
// fused qkv
|
||||
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
|
||||
if (layer.qkv_b != nullptr) {
|
||||
cur = ggml_add(ctx0, cur, layer.qkv_b);
|
||||
}
|
||||
|
||||
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||
if (layer.k_b) {
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
|
||||
}
|
||||
Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
|
||||
/* nb1 */ ggml_row_size(cur->type, d_head),
|
||||
/* nb2 */ cur->nb[1],
|
||||
/* offset */ 0);
|
||||
|
||||
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||
if (layer.v_b) {
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
}
|
||||
Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
|
||||
/* nb1 */ ggml_row_size(cur->type, d_head),
|
||||
/* nb2 */ cur->nb[1],
|
||||
/* offset */ ggml_row_size(cur->type, n_embd));
|
||||
|
||||
if (layer.q_norm) {
|
||||
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
|
||||
cb(Qcur, "Qcur_norm", il);
|
||||
}
|
||||
Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
|
||||
/* nb1 */ ggml_row_size(cur->type, d_head),
|
||||
/* nb2 */ cur->nb[1],
|
||||
/* offset */ ggml_row_size(cur->type, 2 * n_embd));
|
||||
|
||||
if (layer.k_norm) {
|
||||
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
|
||||
cb(Kcur, "Kcur_norm", il);
|
||||
}
|
||||
// TODO: q/k norm requires row size == n_embd, while here it's d_head
|
||||
// we can add support in the future if needed
|
||||
GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
|
||||
} else {
|
||||
// separate q, k, v
|
||||
Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||
if (layer.q_b) {
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
|
||||
}
|
||||
|
||||
Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||
if (layer.k_b) {
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
|
||||
}
|
||||
|
||||
Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||
if (layer.v_b) {
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
}
|
||||
|
||||
if (layer.q_norm) {
|
||||
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
|
||||
cb(Qcur, "Qcur_norm", il);
|
||||
}
|
||||
|
||||
if (layer.k_norm) {
|
||||
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
|
||||
cb(Kcur, "Kcur_norm", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
|
||||
}
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
@@ -7,6 +7,8 @@
|
||||
|
||||
// !!! Internal header, to be used by mtmd only !!!
|
||||
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
struct clip_ctx;
|
||||
|
||||
struct clip_image_size {
|
||||
|
||||
@@ -6,6 +6,8 @@
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
#define WHISPER_ASSERT GGML_ASSERT
|
||||
|
||||
#define WHISPER_SAMPLE_RATE 16000
|
||||
|
||||
@@ -318,7 +318,9 @@ int main(int argc, char ** argv) {
|
||||
g_is_generating = true;
|
||||
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
|
||||
for (size_t i = 0; i < params.image.size(); i++) {
|
||||
params.prompt += mtmd_default_marker();
|
||||
// most models require the marker before each image
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/17616
|
||||
params.prompt = mtmd_default_marker() + params.prompt;
|
||||
}
|
||||
}
|
||||
common_chat_msg msg;
|
||||
|
||||
@@ -32,6 +32,10 @@
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb/stb_image.h"
|
||||
|
||||
#ifdef MTMD_INTERNAL_HEADER
|
||||
#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers"
|
||||
#endif
|
||||
|
||||
//
|
||||
// internal logging functions
|
||||
//
|
||||
|
||||
+32
-19
@@ -32,23 +32,32 @@ fi
|
||||
|
||||
arr_prefix=()
|
||||
arr_hf=()
|
||||
arr_tmpl=() # chat template
|
||||
arr_extra_args=()
|
||||
arr_file=()
|
||||
|
||||
add_test_vision() {
|
||||
local hf=$1
|
||||
local tmpl=${2:-""} # default to empty string if not provided
|
||||
shift
|
||||
local extra_args=""
|
||||
if [ $# -gt 0 ]; then
|
||||
extra_args=$(printf " %q" "$@")
|
||||
fi
|
||||
arr_prefix+=("[vision]")
|
||||
arr_hf+=("$hf")
|
||||
arr_tmpl+=("$tmpl")
|
||||
arr_extra_args+=("$extra_args")
|
||||
arr_file+=("test-1.jpeg")
|
||||
}
|
||||
|
||||
add_test_audio() {
|
||||
local hf=$1
|
||||
shift
|
||||
local extra_args=""
|
||||
if [ $# -gt 0 ]; then
|
||||
extra_args=$(printf " %q" "$@")
|
||||
fi
|
||||
arr_prefix+=("[audio] ")
|
||||
arr_hf+=("$hf")
|
||||
arr_tmpl+=("") # no need for chat tmpl
|
||||
arr_extra_args+=("$extra_args")
|
||||
arr_file+=("test-2.mp3")
|
||||
}
|
||||
|
||||
@@ -56,9 +65,9 @@ add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||
add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
||||
add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
|
||||
add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
|
||||
add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -p "name of the newspaper?<__media__>"
|
||||
add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
|
||||
add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" --chat-template vicuna
|
||||
add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
||||
add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
||||
add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
@@ -79,7 +88,7 @@ add_test_audio "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"
|
||||
# to test the big models, run: ./tests.sh big
|
||||
if [ "$RUN_BIG_TESTS" = true ]; then
|
||||
add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
|
||||
add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" --chat-template mistral-v7
|
||||
add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
@@ -89,7 +98,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then
|
||||
add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
|
||||
# add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
|
||||
add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M"
|
||||
# add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working
|
||||
|
||||
add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
|
||||
add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
|
||||
@@ -122,21 +131,25 @@ for i in "${!arr_hf[@]}"; do
|
||||
bin="llama-mtmd-cli"
|
||||
prefix="${arr_prefix[$i]}"
|
||||
hf="${arr_hf[$i]}"
|
||||
tmpl="${arr_tmpl[$i]}"
|
||||
extra_args="${arr_extra_args[$i]}"
|
||||
inp_file="${arr_file[$i]}"
|
||||
|
||||
echo "Running test with binary: $bin and HF model: $hf"
|
||||
echo ""
|
||||
echo ""
|
||||
|
||||
output=$(\
|
||||
"$PROJ_ROOT/build/bin/$bin" \
|
||||
-hf "$hf" \
|
||||
--image $SCRIPT_DIR/$inp_file \
|
||||
-p "what is the publisher name of the newspaper?" \
|
||||
cmd="$(printf %q "$PROJ_ROOT/build/bin/$bin") \
|
||||
-hf $(printf %q "$hf") \
|
||||
--image $(printf %q "$SCRIPT_DIR/$inp_file") \
|
||||
--temp 0 -n 128 \
|
||||
${tmpl:+--chat-template "$tmpl"} \
|
||||
2>&1 | tee /dev/tty)
|
||||
${extra_args}"
|
||||
|
||||
# if extra_args does not contain -p, we add a default prompt
|
||||
if ! [[ "$extra_args" =~ "-p" ]]; then
|
||||
cmd+=" -p \"what is the publisher name of the newspaper?\""
|
||||
fi
|
||||
|
||||
output=$(eval "$cmd" 2>&1 | tee /dev/tty)
|
||||
|
||||
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
|
||||
|
||||
@@ -144,9 +157,9 @@ for i in "${!arr_hf[@]}"; do
|
||||
if echo "$output" | grep -iq "new york" \
|
||||
|| (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
|
||||
then
|
||||
result="$prefix \033[32mOK\033[0m: $bin $hf"
|
||||
result="$prefix \033[32mOK\033[0m: $hf"
|
||||
else
|
||||
result="$prefix \033[31mFAIL\033[0m: $bin $hf"
|
||||
result="$prefix \033[31mFAIL\033[0m: $hf"
|
||||
fi
|
||||
echo -e "$result"
|
||||
arr_res+=("$result")
|
||||
|
||||
@@ -166,8 +166,8 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
||||
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
|
||||
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
|
||||
| `--mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
|
||||
| `--mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
||||
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
|
||||
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
||||
| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf<br/>(env: LLAMA_ARG_NO_MMPROJ) |
|
||||
| `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) |
|
||||
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
|
||||
|
||||
Binary file not shown.
Generated
+22
-14
@@ -41,7 +41,7 @@
|
||||
"@tailwindcss/vite": "^4.0.0",
|
||||
"@types/node": "^22",
|
||||
"@vitest/browser": "^3.2.3",
|
||||
"bits-ui": "^2.8.11",
|
||||
"bits-ui": "^2.14.4",
|
||||
"clsx": "^2.1.1",
|
||||
"dexie": "^4.0.11",
|
||||
"eslint": "^9.18.0",
|
||||
@@ -3343,17 +3343,17 @@
|
||||
}
|
||||
},
|
||||
"node_modules/bits-ui": {
|
||||
"version": "2.8.11",
|
||||
"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.8.11.tgz",
|
||||
"integrity": "sha512-lKN9rAk69my6j7H1D4B87r8LrHuEtfEsf1xCixBj9yViql2BdI3f04HyyyT7T1GOCpgb9+8b0B+nm3LN81Konw==",
|
||||
"version": "2.14.4",
|
||||
"resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.14.4.tgz",
|
||||
"integrity": "sha512-W6kenhnbd/YVvur+DKkaVJ6GldE53eLewur5AhUCqslYQ0vjZr8eWlOfwZnMiPB+PF5HMVqf61vXBvmyrAmPWg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@floating-ui/core": "^1.7.1",
|
||||
"@floating-ui/dom": "^1.7.1",
|
||||
"esm-env": "^1.1.2",
|
||||
"runed": "^0.29.1",
|
||||
"svelte-toolbelt": "^0.9.3",
|
||||
"runed": "^0.35.1",
|
||||
"svelte-toolbelt": "^0.10.6",
|
||||
"tabbable": "^6.2.0"
|
||||
},
|
||||
"engines": {
|
||||
@@ -3368,9 +3368,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/bits-ui/node_modules/runed": {
|
||||
"version": "0.29.2",
|
||||
"resolved": "https://registry.npmjs.org/runed/-/runed-0.29.2.tgz",
|
||||
"integrity": "sha512-0cq6cA6sYGZwl/FvVqjx9YN+1xEBu9sDDyuWdDW1yWX7JF2wmvmVKfH+hVCZs+csW+P3ARH92MjI3H9QTagOQA==",
|
||||
"version": "0.35.1",
|
||||
"resolved": "https://registry.npmjs.org/runed/-/runed-0.35.1.tgz",
|
||||
"integrity": "sha512-2F4Q/FZzbeJTFdIS/PuOoPRSm92sA2LhzTnv6FXhCoENb3huf5+fDuNOg1LNvGOouy3u/225qxmuJvcV3IZK5Q==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
"https://github.com/sponsors/huntabyte",
|
||||
@@ -3378,23 +3378,31 @@
|
||||
],
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"esm-env": "^1.0.0"
|
||||
"dequal": "^2.0.3",
|
||||
"esm-env": "^1.0.0",
|
||||
"lz-string": "^1.5.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@sveltejs/kit": "^2.21.0",
|
||||
"svelte": "^5.7.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@sveltejs/kit": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/bits-ui/node_modules/svelte-toolbelt": {
|
||||
"version": "0.9.3",
|
||||
"resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.9.3.tgz",
|
||||
"integrity": "sha512-HCSWxCtVmv+c6g1ACb8LTwHVbDqLKJvHpo6J8TaqwUme2hj9ATJCpjCPNISR1OCq2Q4U1KT41if9ON0isINQZw==",
|
||||
"version": "0.10.6",
|
||||
"resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.10.6.tgz",
|
||||
"integrity": "sha512-YWuX+RE+CnWYx09yseAe4ZVMM7e7GRFZM6OYWpBKOb++s+SQ8RBIMMe+Bs/CznBMc0QPLjr+vDBxTAkozXsFXQ==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
"https://github.com/sponsors/huntabyte"
|
||||
],
|
||||
"dependencies": {
|
||||
"clsx": "^2.1.1",
|
||||
"runed": "^0.29.0",
|
||||
"runed": "^0.35.1",
|
||||
"style-to-object": "^1.0.8"
|
||||
},
|
||||
"engines": {
|
||||
|
||||
@@ -43,7 +43,7 @@
|
||||
"@tailwindcss/vite": "^4.0.0",
|
||||
"@types/node": "^22",
|
||||
"@vitest/browser": "^3.2.3",
|
||||
"bits-ui": "^2.8.11",
|
||||
"bits-ui": "^2.14.4",
|
||||
"clsx": "^2.1.1",
|
||||
"dexie": "^4.0.11",
|
||||
"eslint": "^9.18.0",
|
||||
|
||||
@@ -331,6 +331,7 @@
|
||||
class="{INPUT_CLASSES} border-radius-bottom-none mx-auto max-w-[48rem] overflow-hidden rounded-3xl backdrop-blur-md {disabled
|
||||
? 'cursor-not-allowed opacity-60'
|
||||
: ''} {className}"
|
||||
data-slot="chat-form"
|
||||
>
|
||||
<ChatAttachmentsList
|
||||
bind:uploadedFiles
|
||||
|
||||
+2
-16
@@ -1,6 +1,5 @@
|
||||
<script lang="ts">
|
||||
import { Input } from '$lib/components/ui/input';
|
||||
import { Search } from '@lucide/svelte';
|
||||
import { SearchInput } from '$lib/components/app';
|
||||
|
||||
interface Props {
|
||||
value?: string;
|
||||
@@ -15,19 +14,6 @@
|
||||
onInput,
|
||||
class: className
|
||||
}: Props = $props();
|
||||
|
||||
function handleInput(event: Event) {
|
||||
const target = event.target as HTMLInputElement;
|
||||
|
||||
value = target.value;
|
||||
onInput?.(target.value);
|
||||
}
|
||||
</script>
|
||||
|
||||
<div class="relative mb-4 {className}">
|
||||
<Search
|
||||
class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
|
||||
/>
|
||||
|
||||
<Input bind:value class="pl-10" oninput={handleInput} {placeholder} type="search" />
|
||||
</div>
|
||||
<SearchInput bind:value {placeholder} {onInput} class="mb-4 {className}" />
|
||||
|
||||
@@ -64,6 +64,7 @@ export { default as CopyToClipboardIcon } from './misc/CopyToClipboardIcon.svelt
|
||||
export { default as KeyboardShortcutInfo } from './misc/KeyboardShortcutInfo.svelte';
|
||||
export { default as MarkdownContent } from './misc/MarkdownContent.svelte';
|
||||
export { default as RemoveButton } from './misc/RemoveButton.svelte';
|
||||
export { default as SearchInput } from './misc/SearchInput.svelte';
|
||||
export { default as SyntaxHighlightedCode } from './misc/SyntaxHighlightedCode.svelte';
|
||||
export { default as ModelsSelector } from './models/ModelsSelector.svelte';
|
||||
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
<script lang="ts">
|
||||
import { Input } from '$lib/components/ui/input';
|
||||
import { Search, X } from '@lucide/svelte';
|
||||
|
||||
interface Props {
|
||||
value?: string;
|
||||
placeholder?: string;
|
||||
onInput?: (value: string) => void;
|
||||
onClose?: () => void;
|
||||
onKeyDown?: (event: KeyboardEvent) => void;
|
||||
class?: string;
|
||||
id?: string;
|
||||
ref?: HTMLInputElement | null;
|
||||
}
|
||||
|
||||
let {
|
||||
value = $bindable(''),
|
||||
placeholder = 'Search...',
|
||||
onInput,
|
||||
onClose,
|
||||
onKeyDown,
|
||||
class: className,
|
||||
id,
|
||||
ref = $bindable(null)
|
||||
}: Props = $props();
|
||||
|
||||
let showClearButton = $derived(!!value || !!onClose);
|
||||
|
||||
function handleInput(event: Event) {
|
||||
const target = event.target as HTMLInputElement;
|
||||
|
||||
value = target.value;
|
||||
onInput?.(target.value);
|
||||
}
|
||||
|
||||
function handleClear() {
|
||||
if (value) {
|
||||
value = '';
|
||||
onInput?.('');
|
||||
ref?.focus();
|
||||
} else {
|
||||
onClose?.();
|
||||
}
|
||||
}
|
||||
</script>
|
||||
|
||||
<div class="relative {className}">
|
||||
<Search
|
||||
class="absolute top-1/2 left-3 h-4 w-4 -translate-y-1/2 transform text-muted-foreground"
|
||||
/>
|
||||
|
||||
<Input
|
||||
{id}
|
||||
bind:value
|
||||
bind:ref
|
||||
class="pl-9 {showClearButton ? 'pr-9' : ''}"
|
||||
oninput={handleInput}
|
||||
onkeydown={onKeyDown}
|
||||
{placeholder}
|
||||
type="search"
|
||||
/>
|
||||
|
||||
{#if showClearButton}
|
||||
<button
|
||||
type="button"
|
||||
class="absolute top-1/2 right-3 -translate-y-1/2 transform text-muted-foreground transition-colors hover:text-foreground"
|
||||
onclick={handleClear}
|
||||
aria-label={value ? 'Clear search' : 'Close'}
|
||||
>
|
||||
<X class="h-4 w-4" />
|
||||
</button>
|
||||
{/if}
|
||||
</div>
|
||||
@@ -2,8 +2,8 @@
|
||||
import { onMount, tick } from 'svelte';
|
||||
import { ChevronDown, EyeOff, Loader2, MicOff, Package, Power } from '@lucide/svelte';
|
||||
import * as Tooltip from '$lib/components/ui/tooltip';
|
||||
import * as Popover from '$lib/components/ui/popover';
|
||||
import { cn } from '$lib/components/ui/utils';
|
||||
import { portalToBody } from '$lib/utils';
|
||||
import {
|
||||
modelsStore,
|
||||
modelOptions,
|
||||
@@ -17,12 +17,8 @@
|
||||
import { usedModalities, conversationsStore } from '$lib/stores/conversations.svelte';
|
||||
import { ServerModelStatus } from '$lib/enums';
|
||||
import { isRouterMode } from '$lib/stores/server.svelte';
|
||||
import { DialogModelInformation } from '$lib/components/app';
|
||||
import {
|
||||
MENU_MAX_WIDTH,
|
||||
MENU_OFFSET,
|
||||
VIEWPORT_GUTTER
|
||||
} from '$lib/constants/floating-ui-constraints';
|
||||
import { DialogModelInformation, SearchInput } from '$lib/components/app';
|
||||
import type { ModelOption } from '$lib/types/models';
|
||||
|
||||
interface Props {
|
||||
class?: string;
|
||||
@@ -145,185 +141,126 @@
|
||||
return options.some((option) => option.model === currentModel);
|
||||
});
|
||||
|
||||
let isOpen = $state(false);
|
||||
let showModelDialog = $state(false);
|
||||
let container: HTMLDivElement | null = null;
|
||||
let menuRef = $state<HTMLDivElement | null>(null);
|
||||
let triggerButton = $state<HTMLButtonElement | null>(null);
|
||||
let menuPosition = $state<{
|
||||
top: number;
|
||||
left: number;
|
||||
width: number;
|
||||
placement: 'top' | 'bottom';
|
||||
maxHeight: number;
|
||||
} | null>(null);
|
||||
let searchTerm = $state('');
|
||||
let searchInputRef = $state<HTMLInputElement | null>(null);
|
||||
let highlightedIndex = $state<number>(-1);
|
||||
|
||||
onMount(async () => {
|
||||
try {
|
||||
await modelsStore.fetch();
|
||||
} catch (error) {
|
||||
console.error('Unable to load models:', error);
|
||||
}
|
||||
let filteredOptions: ModelOption[] = $derived(
|
||||
(() => {
|
||||
const term = searchTerm.trim().toLowerCase();
|
||||
if (!term) return options;
|
||||
|
||||
return options.filter(
|
||||
(option) =>
|
||||
option.model.toLowerCase().includes(term) || option.name?.toLowerCase().includes(term)
|
||||
);
|
||||
})()
|
||||
);
|
||||
|
||||
// Get indices of compatible options for keyboard navigation
|
||||
let compatibleIndices = $derived(
|
||||
filteredOptions
|
||||
.map((option, index) => (isModelCompatible(option) ? index : -1))
|
||||
.filter((i) => i !== -1)
|
||||
);
|
||||
|
||||
// Reset highlighted index when search term changes
|
||||
$effect(() => {
|
||||
void searchTerm;
|
||||
highlightedIndex = -1;
|
||||
});
|
||||
|
||||
function toggleOpen() {
|
||||
let isOpen = $state(false);
|
||||
let showModelDialog = $state(false);
|
||||
|
||||
onMount(() => {
|
||||
modelsStore.fetch().catch((error) => {
|
||||
console.error('Unable to load models:', error);
|
||||
});
|
||||
});
|
||||
|
||||
function handleOpenChange(open: boolean) {
|
||||
if (loading || updating) return;
|
||||
|
||||
if (isRouter) {
|
||||
// Router mode: show dropdown
|
||||
if (isOpen) {
|
||||
closeMenu();
|
||||
} else {
|
||||
openMenu();
|
||||
if (open) {
|
||||
isOpen = true;
|
||||
searchTerm = '';
|
||||
highlightedIndex = -1;
|
||||
|
||||
// Focus search input after popover opens
|
||||
tick().then(() => {
|
||||
requestAnimationFrame(() => searchInputRef?.focus());
|
||||
});
|
||||
|
||||
if (isRouter) {
|
||||
modelsStore.fetchRouterModels().then(() => {
|
||||
modelsStore.fetchModalitiesForLoadedModels();
|
||||
});
|
||||
}
|
||||
} else {
|
||||
// Single model mode: show dialog
|
||||
showModelDialog = true;
|
||||
isOpen = false;
|
||||
searchTerm = '';
|
||||
highlightedIndex = -1;
|
||||
}
|
||||
}
|
||||
|
||||
async function openMenu() {
|
||||
function handleTriggerClick() {
|
||||
if (loading || updating) return;
|
||||
|
||||
isOpen = true;
|
||||
await tick();
|
||||
updateMenuPosition();
|
||||
requestAnimationFrame(() => updateMenuPosition());
|
||||
|
||||
if (isRouter) {
|
||||
modelsStore.fetchRouterModels().then(() => {
|
||||
modelsStore.fetchModalitiesForLoadedModels();
|
||||
});
|
||||
if (!isRouter) {
|
||||
// Single model mode: show dialog instead of popover
|
||||
showModelDialog = true;
|
||||
}
|
||||
// For router mode, the Popover handles open/close
|
||||
}
|
||||
|
||||
export function open() {
|
||||
if (isRouter) {
|
||||
openMenu();
|
||||
handleOpenChange(true);
|
||||
} else {
|
||||
showModelDialog = true;
|
||||
}
|
||||
}
|
||||
|
||||
function closeMenu() {
|
||||
if (!isOpen) return;
|
||||
|
||||
isOpen = false;
|
||||
menuPosition = null;
|
||||
handleOpenChange(false);
|
||||
}
|
||||
|
||||
function handlePointerDown(event: PointerEvent) {
|
||||
if (!container) return;
|
||||
function handleSearchKeyDown(event: KeyboardEvent) {
|
||||
if (event.isComposing) return;
|
||||
|
||||
const target = event.target as Node | null;
|
||||
if (event.key === 'ArrowDown') {
|
||||
event.preventDefault();
|
||||
if (compatibleIndices.length === 0) return;
|
||||
|
||||
if (target && !container.contains(target) && !(menuRef && menuRef.contains(target))) {
|
||||
closeMenu();
|
||||
}
|
||||
}
|
||||
|
||||
function handleKeydown(event: KeyboardEvent) {
|
||||
if (event.key === 'Escape') {
|
||||
closeMenu();
|
||||
}
|
||||
}
|
||||
|
||||
function handleResize() {
|
||||
if (isOpen) {
|
||||
updateMenuPosition();
|
||||
}
|
||||
}
|
||||
|
||||
function updateMenuPosition() {
|
||||
if (!isOpen || !triggerButton || !menuRef) return;
|
||||
|
||||
const triggerRect = triggerButton.getBoundingClientRect();
|
||||
const viewportWidth = window.innerWidth;
|
||||
const viewportHeight = window.innerHeight;
|
||||
|
||||
if (viewportWidth === 0 || viewportHeight === 0) return;
|
||||
|
||||
const scrollWidth = menuRef.scrollWidth;
|
||||
const scrollHeight = menuRef.scrollHeight;
|
||||
|
||||
const availableWidth = Math.max(0, viewportWidth - VIEWPORT_GUTTER * 2);
|
||||
const constrainedMaxWidth = Math.min(MENU_MAX_WIDTH, availableWidth || MENU_MAX_WIDTH);
|
||||
const safeMaxWidth =
|
||||
constrainedMaxWidth > 0 ? constrainedMaxWidth : Math.min(MENU_MAX_WIDTH, viewportWidth);
|
||||
const desiredMinWidth = Math.min(160, safeMaxWidth || 160);
|
||||
|
||||
let width = Math.min(
|
||||
Math.max(triggerRect.width, scrollWidth, desiredMinWidth),
|
||||
safeMaxWidth || 320
|
||||
);
|
||||
|
||||
const availableBelow = Math.max(
|
||||
0,
|
||||
viewportHeight - VIEWPORT_GUTTER - triggerRect.bottom - MENU_OFFSET
|
||||
);
|
||||
const availableAbove = Math.max(0, triggerRect.top - VIEWPORT_GUTTER - MENU_OFFSET);
|
||||
const viewportAllowance = Math.max(0, viewportHeight - VIEWPORT_GUTTER * 2);
|
||||
const fallbackAllowance = Math.max(1, viewportAllowance > 0 ? viewportAllowance : scrollHeight);
|
||||
|
||||
function computePlacement(placement: 'top' | 'bottom') {
|
||||
const available = placement === 'bottom' ? availableBelow : availableAbove;
|
||||
const allowedHeight =
|
||||
available > 0 ? Math.min(available, fallbackAllowance) : fallbackAllowance;
|
||||
const maxHeight = Math.min(scrollHeight, allowedHeight);
|
||||
const height = Math.max(0, maxHeight);
|
||||
|
||||
let top: number;
|
||||
if (placement === 'bottom') {
|
||||
const rawTop = triggerRect.bottom + MENU_OFFSET;
|
||||
const minTop = VIEWPORT_GUTTER;
|
||||
const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
|
||||
if (maxTop < minTop) {
|
||||
top = minTop;
|
||||
} else {
|
||||
top = Math.min(Math.max(rawTop, minTop), maxTop);
|
||||
}
|
||||
const currentPos = compatibleIndices.indexOf(highlightedIndex);
|
||||
if (currentPos === -1 || currentPos === compatibleIndices.length - 1) {
|
||||
highlightedIndex = compatibleIndices[0];
|
||||
} else {
|
||||
const rawTop = triggerRect.top - MENU_OFFSET - height;
|
||||
const minTop = VIEWPORT_GUTTER;
|
||||
const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
|
||||
if (maxTop < minTop) {
|
||||
top = minTop;
|
||||
} else {
|
||||
top = Math.max(Math.min(rawTop, maxTop), minTop);
|
||||
highlightedIndex = compatibleIndices[currentPos + 1];
|
||||
}
|
||||
} else if (event.key === 'ArrowUp') {
|
||||
event.preventDefault();
|
||||
if (compatibleIndices.length === 0) return;
|
||||
|
||||
const currentPos = compatibleIndices.indexOf(highlightedIndex);
|
||||
if (currentPos === -1 || currentPos === 0) {
|
||||
highlightedIndex = compatibleIndices[compatibleIndices.length - 1];
|
||||
} else {
|
||||
highlightedIndex = compatibleIndices[currentPos - 1];
|
||||
}
|
||||
} else if (event.key === 'Enter') {
|
||||
event.preventDefault();
|
||||
if (highlightedIndex >= 0 && highlightedIndex < filteredOptions.length) {
|
||||
const option = filteredOptions[highlightedIndex];
|
||||
if (isModelCompatible(option)) {
|
||||
handleSelect(option.id);
|
||||
}
|
||||
}
|
||||
|
||||
return { placement, top, height, maxHeight };
|
||||
}
|
||||
|
||||
const belowMetrics = computePlacement('bottom');
|
||||
const aboveMetrics = computePlacement('top');
|
||||
|
||||
let metrics = belowMetrics;
|
||||
if (scrollHeight > belowMetrics.maxHeight && aboveMetrics.maxHeight > belowMetrics.maxHeight) {
|
||||
metrics = aboveMetrics;
|
||||
}
|
||||
|
||||
let left = triggerRect.right - width;
|
||||
const maxLeft = viewportWidth - VIEWPORT_GUTTER - width;
|
||||
if (maxLeft < VIEWPORT_GUTTER) {
|
||||
left = VIEWPORT_GUTTER;
|
||||
} else {
|
||||
if (left > maxLeft) {
|
||||
left = maxLeft;
|
||||
}
|
||||
if (left < VIEWPORT_GUTTER) {
|
||||
left = VIEWPORT_GUTTER;
|
||||
} else if (compatibleIndices.length > 0) {
|
||||
// No selection - highlight first compatible option
|
||||
highlightedIndex = compatibleIndices[0];
|
||||
}
|
||||
}
|
||||
|
||||
menuPosition = {
|
||||
top: Math.round(metrics.top),
|
||||
left: Math.round(left),
|
||||
width: Math.round(width),
|
||||
placement: metrics.placement,
|
||||
maxHeight: Math.round(metrics.maxHeight)
|
||||
};
|
||||
}
|
||||
|
||||
async function handleSelect(modelId: string) {
|
||||
@@ -356,6 +293,14 @@
|
||||
|
||||
if (shouldCloseMenu) {
|
||||
closeMenu();
|
||||
|
||||
// Focus the chat textarea after model selection
|
||||
requestAnimationFrame(() => {
|
||||
const textarea = document.querySelector<HTMLTextAreaElement>(
|
||||
'[data-slot="chat-form"] textarea'
|
||||
);
|
||||
textarea?.focus();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -404,10 +349,7 @@
|
||||
}
|
||||
</script>
|
||||
|
||||
<svelte:window onresize={handleResize} />
|
||||
<svelte:document onpointerdown={handlePointerDown} onkeydown={handleKeydown} />
|
||||
|
||||
<div class={cn('relative inline-flex flex-col items-end gap-1', className)} bind:this={container}>
|
||||
<div class={cn('relative inline-flex flex-col items-end gap-1', className)}>
|
||||
{#if loading && options.length === 0 && isRouter}
|
||||
<div class="flex items-center gap-2 text-xs text-muted-foreground">
|
||||
<Loader2 class="h-3.5 w-3.5 animate-spin" />
|
||||
@@ -418,9 +360,8 @@
|
||||
{:else}
|
||||
{@const selectedOption = getDisplayOption()}
|
||||
|
||||
<div class="relative">
|
||||
<button
|
||||
type="button"
|
||||
<Popover.Root bind:open={isOpen} onOpenChange={handleOpenChange}>
|
||||
<Popover.Trigger
|
||||
class={cn(
|
||||
`inline-flex cursor-pointer items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`,
|
||||
!isCurrentModelInCache()
|
||||
@@ -430,15 +371,11 @@
|
||||
: isHighlightedCurrentModelActive
|
||||
? 'text-foreground'
|
||||
: 'text-muted-foreground',
|
||||
isOpen ? 'text-foreground' : '',
|
||||
className
|
||||
isOpen ? 'text-foreground' : ''
|
||||
)}
|
||||
style="max-width: min(calc(100cqw - 6.5rem), 32rem)"
|
||||
aria-haspopup={isRouter ? 'listbox' : undefined}
|
||||
aria-expanded={isRouter ? isOpen : undefined}
|
||||
onclick={toggleOpen}
|
||||
bind:this={triggerButton}
|
||||
disabled={disabled || updating}
|
||||
onclick={handleTriggerClick}
|
||||
disabled={disabled || updating || !isRouter}
|
||||
>
|
||||
<Package class="h-3.5 w-3.5" />
|
||||
|
||||
@@ -451,33 +388,35 @@
|
||||
{:else if isRouter}
|
||||
<ChevronDown class="h-3 w-3.5" />
|
||||
{/if}
|
||||
</button>
|
||||
</Popover.Trigger>
|
||||
|
||||
{#if isOpen && isRouter}
|
||||
<div
|
||||
bind:this={menuRef}
|
||||
use:portalToBody
|
||||
class={cn(
|
||||
'fixed z-[1000] overflow-hidden rounded-md border bg-popover shadow-lg transition-opacity',
|
||||
menuPosition ? 'opacity-100' : 'pointer-events-none opacity-0'
|
||||
)}
|
||||
role="listbox"
|
||||
style:top={menuPosition ? `${menuPosition.top}px` : undefined}
|
||||
style:left={menuPosition ? `${menuPosition.left}px` : undefined}
|
||||
style:width={menuPosition ? `${menuPosition.width}px` : undefined}
|
||||
data-placement={menuPosition?.placement ?? 'bottom'}
|
||||
>
|
||||
<Popover.Content
|
||||
class="group/popover-content w-96 max-w-[calc(100vw-2rem)] p-0"
|
||||
align="end"
|
||||
sideOffset={8}
|
||||
collisionPadding={16}
|
||||
>
|
||||
<div class="flex max-h-[50dvh] flex-col overflow-hidden">
|
||||
<div
|
||||
class="overflow-y-auto py-1"
|
||||
style:max-height={menuPosition && menuPosition.maxHeight > 0
|
||||
? `${menuPosition.maxHeight}px`
|
||||
: undefined}
|
||||
class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0"
|
||||
>
|
||||
<SearchInput
|
||||
id="model-search"
|
||||
placeholder="Search models..."
|
||||
bind:value={searchTerm}
|
||||
bind:ref={searchInputRef}
|
||||
onClose={closeMenu}
|
||||
onKeyDown={handleSearchKeyDown}
|
||||
/>
|
||||
</div>
|
||||
<div
|
||||
class="models-list order-2 min-h-0 flex-1 overflow-y-auto group-data-[side=top]/popover-content:order-1"
|
||||
>
|
||||
{#if !isCurrentModelInCache() && currentModel}
|
||||
<!-- Show unavailable model as first option (disabled) -->
|
||||
<button
|
||||
type="button"
|
||||
class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-3 py-2 text-left text-sm text-red-400"
|
||||
class="flex w-full cursor-not-allowed items-center bg-red-400/10 px-4 py-2 text-left text-sm text-red-400"
|
||||
role="option"
|
||||
aria-selected="true"
|
||||
aria-disabled="true"
|
||||
@@ -488,20 +427,25 @@
|
||||
</button>
|
||||
<div class="my-1 h-px bg-border"></div>
|
||||
{/if}
|
||||
{#each options as option (option.id)}
|
||||
{#if filteredOptions.length === 0}
|
||||
<p class="px-4 py-3 text-sm text-muted-foreground">No models found.</p>
|
||||
{/if}
|
||||
{#each filteredOptions as option, index (option.id)}
|
||||
{@const status = getModelStatus(option.model)}
|
||||
{@const isLoaded = status === ServerModelStatus.LOADED}
|
||||
{@const isLoading = status === ServerModelStatus.LOADING}
|
||||
{@const isSelected = currentModel === option.model || activeId === option.id}
|
||||
{@const isCompatible = isModelCompatible(option)}
|
||||
{@const isHighlighted = index === highlightedIndex}
|
||||
{@const missingModalities = getMissingModalities(option)}
|
||||
|
||||
<div
|
||||
class={cn(
|
||||
'group flex w-full items-center gap-2 px-3 py-2 text-left text-sm transition focus:outline-none',
|
||||
'group flex w-full items-center gap-2 px-4 py-2 text-left text-sm transition focus:outline-none',
|
||||
isCompatible
|
||||
? 'cursor-pointer hover:bg-muted focus:bg-muted'
|
||||
: 'cursor-not-allowed opacity-50',
|
||||
isSelected
|
||||
isSelected || isHighlighted
|
||||
? 'bg-accent text-accent-foreground'
|
||||
: isCompatible
|
||||
? 'hover:bg-accent hover:text-accent-foreground'
|
||||
@@ -509,10 +453,11 @@
|
||||
isLoaded ? 'text-popover-foreground' : 'text-muted-foreground'
|
||||
)}
|
||||
role="option"
|
||||
aria-selected={isSelected}
|
||||
aria-selected={isSelected || isHighlighted}
|
||||
aria-disabled={!isCompatible}
|
||||
tabindex={isCompatible ? 0 : -1}
|
||||
onclick={() => isCompatible && handleSelect(option.id)}
|
||||
onmouseenter={() => (highlightedIndex = index)}
|
||||
onkeydown={(e) => {
|
||||
if (isCompatible && (e.key === 'Enter' || e.key === ' ')) {
|
||||
e.preventDefault();
|
||||
@@ -586,8 +531,8 @@
|
||||
{/each}
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
</Popover.Content>
|
||||
</Popover.Root>
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
import Root from './popover.svelte';
|
||||
import Close from './popover-close.svelte';
|
||||
import Content from './popover-content.svelte';
|
||||
import Trigger from './popover-trigger.svelte';
|
||||
import Portal from './popover-portal.svelte';
|
||||
|
||||
export {
|
||||
Root,
|
||||
Content,
|
||||
Trigger,
|
||||
Close,
|
||||
Portal,
|
||||
//
|
||||
Root as Popover,
|
||||
Content as PopoverContent,
|
||||
Trigger as PopoverTrigger,
|
||||
Close as PopoverClose,
|
||||
Portal as PopoverPortal
|
||||
};
|
||||
@@ -0,0 +1,7 @@
|
||||
<script lang="ts">
|
||||
import { Popover as PopoverPrimitive } from 'bits-ui';
|
||||
|
||||
let { ref = $bindable(null), ...restProps }: PopoverPrimitive.CloseProps = $props();
|
||||
</script>
|
||||
|
||||
<PopoverPrimitive.Close bind:ref data-slot="popover-close" {...restProps} />
|
||||
@@ -0,0 +1,37 @@
|
||||
<script lang="ts">
|
||||
import { Popover as PopoverPrimitive } from 'bits-ui';
|
||||
import PopoverPortal from './popover-portal.svelte';
|
||||
import { cn, type WithoutChildrenOrChild } from '$lib/components/ui/utils.js';
|
||||
import type { ComponentProps } from 'svelte';
|
||||
|
||||
let {
|
||||
ref = $bindable(null),
|
||||
class: className,
|
||||
sideOffset = 4,
|
||||
side,
|
||||
align = 'center',
|
||||
collisionPadding = 8,
|
||||
avoidCollisions = true,
|
||||
portalProps,
|
||||
...restProps
|
||||
}: PopoverPrimitive.ContentProps & {
|
||||
portalProps?: WithoutChildrenOrChild<ComponentProps<typeof PopoverPortal>>;
|
||||
} = $props();
|
||||
</script>
|
||||
|
||||
<PopoverPortal {...portalProps}>
|
||||
<PopoverPrimitive.Content
|
||||
bind:ref
|
||||
data-slot="popover-content"
|
||||
{sideOffset}
|
||||
{side}
|
||||
{align}
|
||||
{collisionPadding}
|
||||
{avoidCollisions}
|
||||
class={cn(
|
||||
'z-50 w-72 origin-(--bits-popover-content-transform-origin) rounded-md border bg-popover p-4 text-popover-foreground shadow-md outline-hidden data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-end-2 data-[side=right]:slide-in-from-start-2 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95',
|
||||
className
|
||||
)}
|
||||
{...restProps}
|
||||
/>
|
||||
</PopoverPortal>
|
||||
@@ -0,0 +1,7 @@
|
||||
<script lang="ts">
|
||||
import { Popover as PopoverPrimitive } from 'bits-ui';
|
||||
|
||||
let { ...restProps }: PopoverPrimitive.PortalProps = $props();
|
||||
</script>
|
||||
|
||||
<PopoverPrimitive.Portal {...restProps} />
|
||||
@@ -0,0 +1,17 @@
|
||||
<script lang="ts">
|
||||
import { cn } from '$lib/components/ui/utils.js';
|
||||
import { Popover as PopoverPrimitive } from 'bits-ui';
|
||||
|
||||
let {
|
||||
ref = $bindable(null),
|
||||
class: className,
|
||||
...restProps
|
||||
}: PopoverPrimitive.TriggerProps = $props();
|
||||
</script>
|
||||
|
||||
<PopoverPrimitive.Trigger
|
||||
bind:ref
|
||||
data-slot="popover-trigger"
|
||||
class={cn('', className)}
|
||||
{...restProps}
|
||||
/>
|
||||
@@ -0,0 +1,7 @@
|
||||
<script lang="ts">
|
||||
import { Popover as PopoverPrimitive } from 'bits-ui';
|
||||
|
||||
let { open = $bindable(false), ...restProps }: PopoverPrimitive.RootProps = $props();
|
||||
</script>
|
||||
|
||||
<PopoverPrimitive.Root bind:open {...restProps} />
|
||||
@@ -1,3 +1,2 @@
|
||||
export const VIEWPORT_GUTTER = 8;
|
||||
export const MENU_OFFSET = 6;
|
||||
export const MENU_MAX_WIDTH = 320;
|
||||
|
||||
@@ -295,14 +295,21 @@ class ModelsStore {
|
||||
* Fetch props for a specific model from /props endpoint
|
||||
* Uses caching to avoid redundant requests
|
||||
*
|
||||
* In ROUTER mode, this will only fetch props if the model is loaded,
|
||||
* since unloaded models return 400 from /props endpoint.
|
||||
*
|
||||
* @param modelId - Model identifier to fetch props for
|
||||
* @returns Props data or null if fetch failed
|
||||
* @returns Props data or null if fetch failed or model not loaded
|
||||
*/
|
||||
async fetchModelProps(modelId: string): Promise<ApiLlamaCppServerProps | null> {
|
||||
// Return cached props if available
|
||||
const cached = this.modelPropsCache.get(modelId);
|
||||
if (cached) return cached;
|
||||
|
||||
if (serverStore.isRouterMode && !this.isModelLoaded(modelId)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Avoid duplicate fetches
|
||||
if (this.modelPropsFetching.has(modelId)) return null;
|
||||
|
||||
|
||||
@@ -303,6 +303,27 @@ $$\n\\pi_n(\\mathbb{S}^3) = \\begin{cases}
|
||||
expect(output).toBe(input); // Code blocks prevent misinterpretation
|
||||
});
|
||||
|
||||
test('preserves backslash parentheses in code blocks (GitHub issue)', () => {
|
||||
const input = '```python\nfoo = "\\(bar\\)"\n```';
|
||||
const output = preprocessLaTeX(input);
|
||||
|
||||
expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied
|
||||
});
|
||||
|
||||
test('preserves backslash brackets in code blocks', () => {
|
||||
const input = '```python\nfoo = "\\[bar\\]"\n```';
|
||||
const output = preprocessLaTeX(input);
|
||||
|
||||
expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied
|
||||
});
|
||||
|
||||
test('preserves backslash parentheses in inline code', () => {
|
||||
const input = 'Use `foo = "\\(bar\\)"` in your code.';
|
||||
const output = preprocessLaTeX(input);
|
||||
|
||||
expect(output).toBe(input);
|
||||
});
|
||||
|
||||
test('escape backslash in mchem ce', () => {
|
||||
const input = 'mchem ce:\n$\\ce{2H2(g) + O2(g) -> 2H2O(l)}$';
|
||||
const output = preprocessLaTeX(input);
|
||||
|
||||
@@ -226,19 +226,16 @@ export function preprocessLaTeX(content: string): string {
|
||||
return expr;
|
||||
});
|
||||
|
||||
// Step 5: Restore code blocks
|
||||
content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
|
||||
return codeBlocks[parseInt(index)];
|
||||
});
|
||||
|
||||
// Step 6: Apply additional escaping functions (brackets and mhchem)
|
||||
// Step 5: Apply additional escaping functions (brackets and mhchem)
|
||||
// This must happen BEFORE restoring code blocks to avoid affecting code content
|
||||
content = escapeBrackets(content);
|
||||
|
||||
if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) {
|
||||
content = escapeMhchem(content);
|
||||
}
|
||||
|
||||
// Final pass: Convert \(...\) → $...$, \[...\] → $$...$$
|
||||
// Step 6: Convert remaining \(...\) → $...$, \[...\] → $$...$$
|
||||
// This must happen BEFORE restoring code blocks to avoid affecting code content
|
||||
content = content
|
||||
// Using the look‑behind pattern `(?<!\\)` we skip matches
|
||||
// that are preceded by a backslash, e.g.
|
||||
@@ -248,12 +245,18 @@ export function preprocessLaTeX(content: string): string {
|
||||
// Using the look‑behind pattern `(?<!\\)` we skip matches
|
||||
// that are preceded by a backslash, e.g. `\\[4pt]`.
|
||||
/(?<!\\)\\\[([\s\S]*?)\\\]/g, // display, see also PR #16599
|
||||
(_, prefix: string, content: string) => {
|
||||
return `${prefix}$$${content}$$`;
|
||||
(_, content: string) => {
|
||||
return `$$${content}$$`;
|
||||
}
|
||||
);
|
||||
|
||||
// Step 7: Restore blockquote markers
|
||||
// Step 7: Restore code blocks
|
||||
// This happens AFTER all LaTeX conversions to preserve code content
|
||||
content = content.replace(/<<CODE_BLOCK_(\d+)>>/g, (_, index) => {
|
||||
return codeBlocks[parseInt(index)];
|
||||
});
|
||||
|
||||
// Step 8: Restore blockquote markers
|
||||
if (blockquoteMarkers.size > 0) {
|
||||
const finalLines = content.split('\n');
|
||||
const restoredLines = finalLines.map((line, index) => {
|
||||
|
||||
Vendored
+4
@@ -9,6 +9,10 @@ if (NOT MSVC)
|
||||
endif()
|
||||
|
||||
target_link_libraries (${TARGET} PRIVATE Threads::Threads)
|
||||
|
||||
if (WIN32 AND NOT MSVC)
|
||||
target_link_libraries(${TARGET} PUBLIC ws2_32)
|
||||
endif()
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
target_compile_definitions(${TARGET} PRIVATE
|
||||
|
||||
Reference in New Issue
Block a user