Compare commits

..

10 Commits

Author SHA1 Message Date
Aman Gupta 8a4a856277 Add LLaDA 8b Diffusion model (#14771)
* Add support for Llada-8b: diffusion model

* Add README

* Fix README and convert_hf_to_gguf

* convert_hf_to_gguf.py: address review comments

* Make everything in a single example

* Remove model-specific sampling

* Remove unused argmax

* Remove braced initializers, improve README.md a bit

* Add diffusion specific gguf params in set_vocab, remove setting rope_theta and rms_norm_eps

* Remove adding the mask token

* Move add_add_bos_token to set_vocab

* use add_bool in gguf_writer.py
2025-07-31 19:49:09 +08:00
hipudding 11490b3672 CANN: Improve loading efficiency after converting weights to NZ format. (#14985)
* CANN: Improve loading efficiency after converting weights to NZ format.

* CANN: fix typo
2025-07-31 19:47:20 +08:00
compilade 66625a59a5 graph : reduce splits for recurrent and hybrid models (#14825)
* graph : avoid creating redundant s_copy views

* graph : comment the s_copy views
2025-07-31 08:02:46 +03:00
lhez 6e6725459a opencl: add mul_mat_f32_f32_l4_lm and mul_mat_f16_f32_l4_lm (#14809) 2025-07-30 14:56:55 -07:00
Ed Addario e9192bec56 quantize : fix using combined imatrix GGUFs (multiple datasets) (#14973) 2025-07-30 21:11:56 +02:00
Daniel Bevenius 41e78c567e server : add support for embd_normalize parameter (#14964)
This commit adds support for the `embd_normalize` parameter in the
server code.

The motivation for this is that currently if the server is started with
a pooling type that is not `none`, then Euclidean/L2 normalization will
be the normalization method used for embeddings. However, this is not
always the desired behavior, and users may want to use other
normalization (or none) and this commit allows that.

Example usage:
```console
curl --request POST \
    --url http://localhost:8080/embedding \
    --header "Content-Type: application/json" \
    --data '{"input": "Hello world today", "embd_normalize": -1}
```
2025-07-30 18:07:11 +02:00
uvos ad4a700117 HIP: enable mfma mmq on gfx908 and gfx90a for select datatypes and shapes (#14949) 2025-07-30 17:38:06 +02:00
Georgi Gerganov e32a4ec60e sync : ggml
ggml-ci
2025-07-30 17:33:11 +03:00
Kai Pastor e228de9449 cmake : Fix BLAS link interface (ggml/1316) 2025-07-30 17:33:11 +03:00
Kai Pastor 73a8e5ca03 vulkan : fix 32-bit builds (ggml/1313)
The pipeline member can be cast to VkPipeline.
This is a VkPipeline_T* on 64 bit but a uint64_t on 32 bit.
Cf. VK_DEFINE_NON_DISPATCHABLE_HANDLE documentation.
2025-07-30 17:33:11 +03:00
30 changed files with 1486 additions and 492 deletions
+34 -17
View File
@@ -3438,28 +3438,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
// diffusion parameters
add_opt(common_arg(
{ "--diffusion-steps" }, "N",
string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
[](common_params & params, int value) { params.diffusion.steps = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-eps" }, "F",
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-algorithm" }, "N",
string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
params.diffusion.algorithm),
[](common_params & params, int value) { params.diffusion.algorithm = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-alg-temp" }, "F",
string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-visual" },
string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
@@ -3467,5 +3450,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) { params.diffusion.visual_mode = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-eps" }, "F",
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-algorithm" }, "N",
string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
params.diffusion.algorithm),
[](common_params & params, int value) { params.diffusion.algorithm = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-alg-temp" }, "F",
string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-block-length" }, "N",
string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
[](common_params & params, int value) { params.diffusion.block_length = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-cfg-scale" }, "F",
string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
[](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-add-gumbel-noise" }, "F",
string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
[](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
return ctx_arg;
}
+11 -5
View File
@@ -220,11 +220,17 @@ struct common_params_vocoder {
};
struct common_params_diffusion {
int32_t steps = 64; // number of diffusion steps
float eps = 1e-3f; // epsilon for timesteps
int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
float alg_temp = 0.0f; // algorithm temperature
bool visual_mode = false; // show progressive diffusion on screen
int32_t steps = 128;
bool visual_mode = false;
float eps = 0; // epsilon for timesteps
int32_t block_length = 32; // block length for generation
int32_t algorithm = 4; // default algorithm: low-confidence
float alg_temp = 0.0f; // algorithm temperature
float cfg_scale = 0; // classifier-free guidance scale
bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
};
enum common_reasoning_format {
+101
View File
@@ -2904,6 +2904,107 @@ class DreamModel(TextModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("LLaDAModelLM")
class LLaDAModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLADA
undo_permute = True
def get_vocab_base(self) -> tuple[list[str], list[int], str]:
tokens: list[str] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
vocab_dict = tokenizer.get_vocab()
vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
assert max(vocab_dict.values()) < vocab_size
tokpre = self.get_vocab_base_pre(tokenizer)
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
added_vocab = tokenizer.get_added_vocab()
for i in range(vocab_size):
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.UNUSED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
# Check if it's a special token - treat special tokens as CONTROL tokens
if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder:
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
# Fallback: treat all added vocab as control tokens for special tokens like <|im_start|>
toktypes.append(gguf.TokenType.CONTROL)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
return tokens, toktypes, tokpre
def set_vocab(self):
self._set_vocab_gpt2()
# LLaDA specific parameters
self.gguf_writer.add_add_bos_token(True)
def set_gguf_parameters(self):
super().set_gguf_parameters()
self._try_set_pooling_type()
# Add parameters similar to LlamaModel
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if (rope_dim := hparams.get("head_dim")) is None:
n_heads = hparams.get("num_attention_heads", hparams.get("n_heads"))
rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads
self.gguf_writer.add_rope_dimension_count(rope_dim)
# Set context length for LLaDA
context_length = self.hparams.get("max_sequence_length", 4096)
self.gguf_writer.add_context_length(context_length)
# Set embedding length (dimension size)
embedding_length = self.hparams.get("d_model", 4096)
self.gguf_writer.add_embedding_length(embedding_length)
# Set feed forward length (MLP hidden size)
feed_forward_length = self.hparams.get("mlp_hidden_size", 12288)
self.gguf_writer.add_feed_forward_length(feed_forward_length)
# LLaDA models use non-causal attention for diffusion, similar to Dream
self.gguf_writer.add_causal_attention(False)
# LLaDA models don't shift their logits
self.gguf_writer.add_diffusion_shift_logits(False)
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads"))
n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads"))
if self.undo_permute:
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LLaDAModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LLaDAModel.permute(data_torch, n_head, n_kv_head)
# LLaDA model tensors should be mapped directly since it's the base model
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Ernie4_5_ForCausalLM")
class Ernie4_5Model(TextModel):
model_arch = gguf.MODEL_ARCH.ERNIE4_5
+4 -2
View File
@@ -310,5 +310,7 @@ Specifies the memory pool management strategy:
Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.
## TODO
- Support more models and data types.
### GGML_CANN_WEIGHT_NZ
Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
+13
View File
@@ -0,0 +1,13 @@
# Diffusion Text Generation
This directory contains implementations for Diffusion LLMs (DLLMs)
More Info:
- https://github.com/ggml-org/llama.cpp/pull/14644
- https://github.com/ggml-org/llama.cpp/pull/14771
Example of using Dream architechture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual`
Example of using LLaDA architechture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual`
+539 -363
View File
@@ -5,344 +5,128 @@
#include "log.h"
#include <limits.h>
#include <string>
#include <vector>
#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>
#include <random>
#include <string>
#include <vector>
typedef bool (*diffusion_step_callback_t)(int32_t step,
int32_t total_steps,
const llama_token * tokens,
int32_t n_tokens,
void * user_data);
enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 };
enum diffusion_alg {
DIFFUSION_ALG_ORIGIN = 0,
DIFFUSION_ALG_MASKGIT_PLUS = 1,
DIFFUSION_ALG_TOPK_MARGIN = 2,
DIFFUSION_ALG_ENTROPY = 3,
// Unified transfer scheduling methods
enum transfer_schedule {
TIMESTEP_BASED = 0, // Dream-style: (1.0 - s/t) * remaining
BLOCK_BASED = 1, // LLaDA-style: process in blocks with get_num_transfer_tokens
};
typedef bool (*diffusion_step_callback_t)(int32_t step,
int32_t total_steps,
const llama_token * tokens,
int32_t n_tokens,
void * user_data);
struct diffusion_params {
int32_t steps;
float eps;
float temperature;
float top_p;
int32_t top_k;
llama_token mask_token_id;
enum diffusion_alg algorithm;
float alg_temp;
diffusion_step_callback_t step_callback;
void * step_callback_user_data;
int32_t seed;
int32_t steps = 0;
float temperature = 0;
llama_token mask_token_id = LLAMA_TOKEN_NULL;
diffusion_step_callback_t step_callback = nullptr;
void * step_callback_user_data = nullptr;
int32_t seed = 0;
bool visual_mode = false;
bool shift_logits = false; // Shift logits by -1 after decode
float top_p = 0.;
int32_t top_k = 0.;
diffusion_algorithm algorithm = CONFIDENCE_BASED;
transfer_schedule schedule = TIMESTEP_BASED;
float cfg_scale = 0.; // Config scale for classifier-free guidance
float eps = 0.; // Timestep scheduling
int32_t block_length = 0; // Block size (for block scheduling)
float alg_temp = 0; // algorithm temperature (0.0 = deterministic)
bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0
int32_t max_length = 0; // Maximum sequence length
};
static diffusion_params diffusion_default_params() {
diffusion_params params = {};
params.steps = 64;
params.eps = 1e-3f;
params.temperature = 0.2f;
params.top_p = 0.95f;
params.top_k = 0;
params.mask_token_id = LLAMA_TOKEN_NULL;
params.algorithm = DIFFUSION_ALG_ORIGIN;
params.alg_temp = 0.0f;
params.step_callback = nullptr;
params.step_callback_user_data = nullptr;
params.seed = 0;
return params;
}
static void diffusion_generate(llama_context * ctx,
const llama_token * input_tokens,
llama_token * output_tokens,
int32_t n_input,
int32_t max_length,
struct diffusion_params params,
int32_t & n_generated) {
n_generated = 0;
if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || max_length <= n_input) {
return;
}
const llama_model * model = llama_get_model(ctx);
// Initialize with input and pad with mask tokens
std::copy(input_tokens, input_tokens + n_input, output_tokens);
std::fill(output_tokens + n_input, output_tokens + max_length, params.mask_token_id);
std::mt19937 rng(params.seed);
std::vector<float> timesteps(params.steps + 1);
for (int32_t i = 0; i <= params.steps; i++) {
timesteps[i] = 1.0f - (float) i / params.steps * (1.0f - params.eps);
}
llama_set_causal_attn(ctx, false);
int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
std::vector<llama_token_data> candidates(n_vocab);
std::vector<llama_token_data> conf_candidates;
conf_candidates.reserve(max_length);
std::vector<int32_t> mask_positions;
mask_positions.reserve(max_length);
struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
if (params.top_k > 0) {
llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
}
if (params.top_p < 1.0f) {
llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
}
if (params.temperature > 0.0f) {
llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
}
llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
llama_batch batch = llama_batch_init(max_length, 0, 1);
batch.n_tokens = max_length;
int64_t total_sampling_time = 0;
int64_t total_time = 0;
int64_t time_start = ggml_time_us();
for (int32_t step = 0; step < params.steps; step++) {
if (params.step_callback) {
if (!params.step_callback(step, params.steps, output_tokens, max_length, params.step_callback_user_data)) {
break;
}
}
for (int32_t i = 0; i < max_length; i++) {
batch.token[i] = output_tokens[i];
batch.pos[i] = i;
batch.n_seq_id[i] = 1;
batch.seq_id[i][0] = 0;
batch.logits[i] = 1;
}
int ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, step, ret);
break;
}
float * raw_logits = llama_get_logits(ctx);
if (!raw_logits) {
LOG_ERR("%s: failed to get logits at step %d\n", __func__, step);
break;
}
auto get_logits_for_pos = [&](int32_t pos) -> const float * {
return pos == 0 ? raw_logits : raw_logits + (pos - 1) * n_vocab;
};
int64_t time_start_sampling = ggml_time_us();
mask_positions.clear();
for (int32_t i = 0; i < max_length; i++) {
if (output_tokens[i] == params.mask_token_id) {
mask_positions.push_back(i);
}
}
if (mask_positions.empty()) {
break;
}
float t = timesteps[step];
float s = timesteps[step + 1];
if (params.algorithm == DIFFUSION_ALG_ORIGIN) {
float p_transfer = (step < params.steps - 1) ? (1.0f - s / t) : 1.0f;
for (int32_t pos : mask_positions) {
if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
const float * pos_logits = get_logits_for_pos(pos);
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
candidates[token_id].id = token_id;
candidates[token_id].logit = pos_logits[token_id];
candidates[token_id].p = 0.0f;
}
llama_token_data_array cur_p = {
/* .data = */ candidates.data(),
/* .size = */ (size_t) n_vocab, // Reset size to full vocab
/* .selected = */ -1,
/* .sorted = */ false,
};
llama_sampler_apply(sampler, &cur_p);
output_tokens[pos] = cur_p.data[cur_p.selected].id;
}
}
} else {
std::vector<std::pair<float, int32_t>> confidences;
std::vector<llama_token> sampled_tokens(mask_positions.size());
for (size_t i = 0; i < mask_positions.size(); i++) {
int32_t pos = mask_positions[i];
const float * pos_logits = get_logits_for_pos(pos);
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
candidates[token_id].logit = pos_logits[token_id];
candidates[token_id].p = 0.0f;
candidates[token_id].id = token_id;
}
llama_token_data_array cur_p = {
/* .data = */ candidates.data(),
/* .size = */ candidates.size(),
/* .selected = */ -1,
/* .sorted = */ false,
};
llama_sampler_apply(sampler, &cur_p);
llama_token sampled_token = cur_p.data[cur_p.selected].id;
float confidence = 0.0f;
if (params.algorithm == DIFFUSION_ALG_ENTROPY) {
const float epsilon = 1e-10f;
for (size_t j = 0; j < cur_p.size; j++) {
float prob = cur_p.data[j].p;
confidence += prob * logf(prob + epsilon);
}
} else if (params.algorithm == DIFFUSION_ALG_TOPK_MARGIN) {
confidence = cur_p.data[0].p - cur_p.data[1].p;
} else {
confidence = cur_p.data[cur_p.selected].p;
}
sampled_tokens[i] = sampled_token;
confidences.emplace_back(confidence, i);
}
int32_t num_transfer =
(step < params.steps - 1) ? (int32_t) (mask_positions.size() * (1.0f - s / t)) : mask_positions.size();
if (num_transfer > 0) {
if (params.alg_temp == 0.0f) {
std::partial_sort(confidences.begin(), confidences.begin() + num_transfer, confidences.end(),
[](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
if (a.first != b.first) {
return a.first > b.first;
}
return a.second < b.second;
});
} else {
conf_candidates.clear();
for (int32_t pos = 0; pos < max_length; pos++) {
float conf_logit = -std::numeric_limits<float>::infinity();
auto it = std::find(mask_positions.begin(), mask_positions.end(), pos);
if (it != mask_positions.end()) {
size_t mask_idx = std::distance(mask_positions.begin(), it);
conf_logit = confidences[mask_idx].first / params.alg_temp; // Apply temperature scaling
}
conf_candidates.emplace_back(llama_token_data{ pos, conf_logit, 0.0f });
}
llama_token_data_array conf_array = {
/* .data = */ conf_candidates.data(),
/* .size = */ conf_candidates.size(),
/* .selected = */ -1,
/* .sorted = */ false,
};
for (int32_t i = 0; i < num_transfer; i++) {
// Apply distribution sampler to get selected index
llama_sampler_apply(dist_sampler, &conf_array);
int selected_idx = conf_array.selected;
confidences[i].second = conf_candidates[selected_idx].id;
conf_candidates[selected_idx].p = 0.0f;
conf_array.selected = -1;
}
}
if (params.alg_temp == 0.0f) {
// Deterministic - use confidence order
for (int32_t i = 0; i < num_transfer; i++) {
int32_t mask_idx = confidences[i].second;
int32_t pos = mask_positions[mask_idx];
llama_token token = sampled_tokens[mask_idx];
output_tokens[pos] = token;
}
} else {
for (int32_t i = 0; i < num_transfer; i++) {
int32_t pos = confidences[i].second;
auto it = std::find(mask_positions.begin(), mask_positions.end(), pos);
if (it != mask_positions.end()) {
int32_t mask_idx = std::distance(mask_positions.begin(), it);
output_tokens[pos] = sampled_tokens[mask_idx];
}
}
}
}
}
int64_t time_end_sampling = ggml_time_us();
total_sampling_time += time_end_sampling - time_start_sampling;
}
int64_t time_end = ggml_time_us();
total_time += time_end - time_start;
LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
total_time / 1000.0, total_time / 1000.0 / params.steps, total_sampling_time / 1000.0 / params.steps);
llama_batch_free(batch);
llama_sampler_free(sampler);
llama_sampler_free(dist_sampler);
n_generated = max_length;
}
static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
if (!use_chat_template) {
return prompt;
}
auto chat_templates = common_chat_templates_init(model, "");
common_chat_templates_inputs inputs;
common_chat_msg user_msg;
user_msg.role = "user";
user_msg.content = prompt;
inputs.add_generation_prompt = true;
inputs.messages.push_back(user_msg);
auto result = common_chat_templates_apply(chat_templates.get(), inputs);
return result.prompt;
}
struct callback_data {
const common_params_diffusion * diff_params;
const llama_vocab * vocab;
int32_t n_input;
diffusion_params * diff_params;
const llama_vocab * vocab;
int32_t n_input;
};
static bool diffusion_step_callback(int32_t step,
int32_t total_steps,
static float calculate_confidence(const llama_token_data_array & cur_p,
diffusion_algorithm algorithm,
std::mt19937 & rng) {
switch (algorithm) {
case CONFIDENCE_BASED:
return cur_p.data[cur_p.selected].p; // Selected token probability
case ENTROPY_BASED:
{
float entropy = 0.0f;
const float epsilon = 1e-10f;
for (size_t i = 0; i < cur_p.size; i++) {
float prob = cur_p.data[i].p;
entropy += prob * logf(prob + epsilon);
}
return -entropy; // Higher entropy = lower confidence
}
case MARGIN_BASED:
return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;
case RANDOM:
{
std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
return uniform(rng); // Random confidence
}
case ORIGIN:
return cur_p.data[cur_p.selected].p;
default:
return 0.0f;
}
}
// Unified transfer count calculation function
static int32_t calculate_transfer_count(int32_t step,
int32_t total_steps,
int32_t remaining_masked,
transfer_schedule schedule,
float eps,
const std::vector<int32_t> & num_transfer_tokens = {}) {
switch (schedule) {
case TIMESTEP_BASED:
{
float t = 1.0f - (float) step / total_steps * (1.0f - eps);
float s = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
return (int32_t) (remaining_masked * p_transfer);
}
case BLOCK_BASED:
if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
return num_transfer_tokens[step];
}
return remaining_masked / (total_steps - step); // Fallback
default:
return remaining_masked / (total_steps - step);
}
}
static bool diffusion_step_callback(int32_t step,
int32_t total_steps,
const llama_token * tokens,
int32_t n_tokens,
void * user_data) {
(void)user_data;
int32_t n_tokens,
void * user_data) {
(void) user_data;
callback_data * data = static_cast<callback_data *>(user_data);
@@ -350,11 +134,11 @@ static bool diffusion_step_callback(int32_t step,
int progress_percent = (step * 100) / total_steps;
int progress_bars = (step * 50) / total_steps;
LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%",
step,
total_steps,
std::string(progress_bars, '=').c_str(),
std::string(50 - progress_bars, ' ').c_str(),
progress_percent);
step,
total_steps,
std::string(progress_bars, '=').c_str(),
std::string(50 - progress_bars, ' ').c_str(),
progress_percent);
};
if (data->diff_params->visual_mode) {
@@ -391,6 +175,360 @@ static bool diffusion_step_callback(int32_t step,
return true;
}
static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
if (temperature == 0.0f) {
return;
}
std::uniform_real_distribution<double> uniform(0.0, 1.0);
for (int32_t i = 0; i < n_vocab; i++) {
double noise = uniform(rng);
// Prevent log(0)
noise = std::max(noise, 1e-20);
double gumbel_noise = std::pow(-std::log(noise), temperature);
logits[i] = std::exp(logits[i]) / gumbel_noise;
}
}
static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
std::vector<int32_t> num_transfer_tokens(steps);
int32_t base = mask_count / steps;
int32_t remainder = mask_count % steps;
for (int32_t i = 0; i < steps; i++) {
num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
}
return num_transfer_tokens;
}
static void diffusion_generate(llama_context * ctx,
const llama_token * input_tokens,
llama_token * output_tokens,
int32_t n_input,
const diffusion_params & params,
int32_t & n_generated) {
n_generated = 0;
if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
return;
}
const llama_model * model = llama_get_model(ctx);
// Initialize with input and pad with mask tokens
std::copy(input_tokens, input_tokens + n_input, output_tokens);
std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);
std::mt19937 rng(params.seed);
llama_set_causal_attn(ctx, false);
int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
std::vector<llama_token_data> candidates(n_vocab);
std::vector<llama_token_data> conf_candidates;
conf_candidates.reserve(params.max_length);
std::vector<int32_t> mask_positions;
mask_positions.reserve(params.max_length);
// Setup sampler chain
struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
if (params.top_k > 0) {
llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
}
if (params.top_p < 1.0f) {
llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
}
if (params.temperature > 0.0f) {
llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
}
llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
llama_batch batch = llama_batch_init(params.max_length, 0, 1);
batch.n_tokens = params.max_length;
// Pre-allocate buffers for CFG if needed
int32_t logits_size = n_vocab * params.max_length;
std::vector<float> cond_logits_buffer;
std::vector<llama_token> un_x_buffer;
if (params.cfg_scale > 0.0f) {
cond_logits_buffer.resize(logits_size);
un_x_buffer.resize(params.max_length);
}
// For block-based processing
std::vector<int32_t> num_transfer_tokens;
int32_t num_blocks = 1;
int32_t steps_per_block = params.steps;
if (params.schedule == BLOCK_BASED) {
GGML_ASSERT(params.max_length % params.block_length == 0);
num_blocks = params.max_length / params.block_length;
GGML_ASSERT(params.steps % num_blocks == 0);
steps_per_block = params.steps / num_blocks;
}
std::vector<float> confidence(params.max_length);
int64_t total_sampling_time = 0;
int64_t total_time = 0;
int64_t time_start = ggml_time_us();
for (int block_num = 0; block_num < num_blocks; block_num++) {
int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
int32_t block_end = (params.schedule == BLOCK_BASED) ?
std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
params.max_length;
// Count masked tokens in current block for block-based processing
if (params.schedule == BLOCK_BASED) {
int32_t block_mask_count = 0;
for (int i = block_start; i < block_end; i++) {
if (output_tokens[i] == params.mask_token_id) {
block_mask_count++;
}
}
num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
}
for (int32_t step = 0; step < steps_per_block; step++) {
int32_t global_step = block_num * steps_per_block + step;
if (params.step_callback) {
if (!params.step_callback(
global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
break;
}
}
// Setup batch
for (int32_t i = 0; i < params.max_length; i++) {
batch.token[i] = output_tokens[i];
batch.pos[i] = i;
batch.n_seq_id[i] = 1;
batch.seq_id[i][0] = 0;
batch.logits[i] = 1;
}
float * logits = nullptr;
if (params.cfg_scale > 0.0f) {
int ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("Failed to generate conditional");
break;
}
float * cond_logits_ptr = llama_get_logits(ctx);
std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));
// Unconditional generation (mask input)
std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
for (int32_t i = 0; i < n_input; i++) {
un_x_buffer[i] = params.mask_token_id;
}
for (int32_t i = 0; i < params.max_length; i++) {
batch.token[i] = un_x_buffer[i];
}
ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("Failed to generate unconditional");
break;
}
float * uncond_logits = llama_get_logits(ctx);
// Apply CFG
for (int32_t i = 0; i < logits_size; i++) {
cond_logits_buffer[i] =
uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
}
logits = cond_logits_buffer.data();
} else {
int ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
break;
}
logits = llama_get_logits(ctx);
}
if (!logits) {
LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
break;
}
auto get_logits_for_pos = [&](int32_t pos) -> const float * {
if (params.shift_logits) {
return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
}
return logits + (pos) *n_vocab;
};
int64_t time_start_sampling = ggml_time_us();
mask_positions.clear();
for (int32_t i = 0; i < params.max_length; i++) {
if (output_tokens[i] == params.mask_token_id) {
// For block-based, only consider current block
if (params.schedule != BLOCK_BASED || (i >= block_start && i < block_end)) {
mask_positions.push_back(i);
}
}
}
if (mask_positions.empty()) {
break;
}
if (params.add_gumbel_noise && params.temperature > 0.0f) {
add_gumbel_noise(logits, n_vocab, params.temperature, rng);
}
if (params.algorithm == ORIGIN) {
int32_t transfer_count = calculate_transfer_count(
step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
float p_transfer = (float) transfer_count / mask_positions.size();
for (int32_t pos : mask_positions) {
if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
const float * pos_logits = get_logits_for_pos(pos);
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
candidates[token_id].id = token_id;
candidates[token_id].logit = pos_logits[token_id];
candidates[token_id].p = 0.0f;
}
llama_token_data_array cur_p = {
candidates.data(),
(size_t) n_vocab,
-1,
false,
};
llama_sampler_apply(sampler, &cur_p);
output_tokens[pos] = cur_p.data[cur_p.selected].id;
}
}
} else {
std::vector<std::pair<float, int32_t>> confidences;
std::vector<llama_token> sampled_tokens(mask_positions.size());
for (size_t i = 0; i < mask_positions.size(); i++) {
int32_t pos = mask_positions[i];
const float * pos_logits = get_logits_for_pos(pos);
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
candidates[token_id].logit = pos_logits[token_id];
candidates[token_id].p = 0.0f;
candidates[token_id].id = token_id;
}
llama_token_data_array cur_p = {
candidates.data(),
candidates.size(),
-1,
false,
};
llama_sampler_apply(sampler, &cur_p);
llama_token sampled_token = cur_p.data[cur_p.selected].id;
float conf = calculate_confidence(cur_p, params.algorithm, rng);
sampled_tokens[i] = sampled_token;
confidences.emplace_back(conf, i);
}
int32_t transfer_count = calculate_transfer_count(
step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
if (transfer_count > 0) {
if (params.alg_temp == 0.0f) {
std::partial_sort(confidences.begin(),
confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
confidences.end(),
[](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
if (a.first != b.first) {
return a.first > b.first;
}
return a.second < b.second;
});
for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
int32_t mask_idx = confidences[i].second;
int32_t pos = mask_positions[mask_idx];
output_tokens[pos] = sampled_tokens[mask_idx];
}
} else {
conf_candidates.clear();
for (size_t i = 0; i < confidences.size(); i++) {
float conf_logit = confidences[i].first / params.alg_temp;
conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
}
llama_token_data_array conf_array = {
conf_candidates.data(),
conf_candidates.size(),
-1,
false,
};
for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
llama_sampler_apply(dist_sampler, &conf_array);
int32_t selected_idx = conf_array.selected;
int32_t mask_idx = selected_idx;
int32_t pos = mask_positions[mask_idx];
output_tokens[pos] = sampled_tokens[mask_idx];
conf_candidates[selected_idx].p = 0.0f;
conf_array.selected = -1;
}
}
}
}
int64_t time_end_sampling = ggml_time_us();
total_sampling_time += time_end_sampling - time_start_sampling;
}
}
int64_t time_end = ggml_time_us();
total_time += time_end - time_start;
LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
total_time / 1000.0,
total_time / 1000.0 / params.steps,
total_sampling_time / 1000.0 / params.steps);
llama_batch_free(batch);
llama_sampler_free(sampler);
llama_sampler_free(dist_sampler);
n_generated = params.max_length;
}
static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
if (!use_chat_template) {
return prompt;
}
auto chat_templates = common_chat_templates_init(model, "");
common_chat_templates_inputs inputs;
common_chat_msg user_msg;
user_msg.role = "user";
user_msg.content = prompt;
inputs.add_generation_prompt = true;
inputs.messages.push_back(user_msg);
auto result = common_chat_templates_apply(chat_templates.get(), inputs);
return result.prompt;
}
int main(int argc, char ** argv) {
ggml_time_init();
@@ -400,11 +538,6 @@ int main(int argc, char ** argv) {
return 1;
}
const char * alg_names[] = { "ORIGIN", "MASKGIT_PLUS", "TOPK_MARGIN", "ENTROPY" };
const char * alg_name = (params.diffusion.algorithm >= 0 && params.diffusion.algorithm <= 3) ?
alg_names[params.diffusion.algorithm] :
"UNKNOWN";
common_init();
llama_backend_init();
@@ -421,6 +554,12 @@ int main(int argc, char ** argv) {
return 1;
}
if (!llama_model_is_diffusion(model)) {
LOG_ERR("error: unsupported model for diffusion");
llama_model_free(model);
return 1;
}
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = params.n_ctx;
ctx_params.n_batch = params.n_batch;
@@ -442,10 +581,12 @@ int main(int argc, char ** argv) {
const llama_vocab * vocab = llama_model_get_vocab(model);
std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);
std::vector<llama_token> input_tokens = common_tokenize(vocab, formatted_prompt,
std::vector<llama_token> input_tokens = common_tokenize(vocab,
formatted_prompt,
/*add special tokens*/ true,
/*parse special*/ true);
int n_input = input_tokens.size();
int n_input = input_tokens.size();
if (n_input >= params.n_ctx) {
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
@@ -454,44 +595,79 @@ int main(int argc, char ** argv) {
return 1;
}
struct diffusion_params ldiff_params = diffusion_default_params();
ldiff_params.steps = params.diffusion.steps;
ldiff_params.eps = params.diffusion.eps;
ldiff_params.temperature = params.sampling.temp;
ldiff_params.top_p = params.sampling.top_p;
ldiff_params.top_k = params.sampling.top_k;
ldiff_params.algorithm = static_cast<enum diffusion_alg>(params.diffusion.algorithm);
ldiff_params.alg_temp = params.diffusion.alg_temp;
ldiff_params.seed = params.sampling.seed;
llama_token mask_token_id = llama_vocab_mask(vocab);
GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);
LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id);
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", params.diffusion.steps);
LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", params.diffusion.eps);
LOG_INF("diffusion_params: - %-25s u32 = %d (%s)\n", "algorithm", params.diffusion.algorithm,
alg_name);
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", params.diffusion.alg_temp);
ldiff_params.mask_token_id = mask_token_id;
callback_data cb_data = { &params.diffusion, vocab, n_input };
ldiff_params.step_callback = diffusion_step_callback;
ldiff_params.step_callback_user_data = &cb_data;
int32_t n_generated = 0;
bool visual_mode = params.diffusion.visual_mode;
int32_t n_generated = 0;
std::vector<llama_token> output_tokens(params.n_ubatch);
diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, params.n_ubatch,
ldiff_params, n_generated);
struct diffusion_params diff_params;
char shift_logits_str[8];
if (llama_model_meta_val_str(model, "diffusion.shift_logits", shift_logits_str, sizeof(shift_logits_str)) >= 0) {
diff_params.shift_logits = (strcmp(shift_logits_str, "true") == 0);
} else {
diff_params.shift_logits = true;
}
//Use either eps or block length, but not both
GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0));
if (params.diffusion.eps) {
diff_params.schedule = TIMESTEP_BASED;
diff_params.eps = params.diffusion.eps;
} else if (params.diffusion.block_length) {
diff_params.schedule = BLOCK_BASED;
diff_params.block_length = params.diffusion.block_length;
}
diff_params.mask_token_id = mask_token_id;
diff_params.seed = params.sampling.seed;
diff_params.temperature = params.sampling.temp;
diff_params.steps = params.diffusion.steps;
diff_params.algorithm = static_cast<diffusion_algorithm>(params.diffusion.algorithm);
diff_params.max_length = params.n_ubatch;
diff_params.top_p = params.sampling.top_p;
diff_params.top_k = params.sampling.top_k;
diff_params.visual_mode = params.diffusion.visual_mode;
diff_params.add_gumbel_noise = params.diffusion.add_gumbel_noise;
diff_params.step_callback = diffusion_step_callback;
callback_data cb_data = { &diff_params, vocab, n_input };
diff_params.step_callback_user_data = &cb_data;
const char * alg_names[] = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" };
const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" };
const char * alg_name =
(diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN";
const char * sched_name =
(diff_params.schedule >= 0 && diff_params.schedule <= 1) ? sched_names[diff_params.schedule] : "UNKNOWN";
LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id);
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", diff_params.steps);
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "max_length", diff_params.max_length);
LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name);
LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "schedule", diff_params.schedule, sched_name);
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "temperature", diff_params.temperature);
if (diff_params.schedule == TIMESTEP_BASED) {
LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", diff_params.eps);
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", diff_params.alg_temp);
}
if (diff_params.schedule == BLOCK_BASED) {
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "block_length", diff_params.block_length);
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", diff_params.cfg_scale);
}
diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, diff_params, n_generated);
if (n_generated > 0) {
if (params.diffusion.visual_mode) {
if (visual_mode) {
//clear screen and move cursor to top-left
LOG_INF("\033[2J\033[H");
}
output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input);
std::string output_data = common_detokenize(vocab, output_tokens, false);
LOG_INF("\n%s\n", output_data.c_str());
+2 -2
View File
@@ -34,8 +34,8 @@ if (NOT GGML_SHARED_LIB)
if (GGML_BLAS)
find_dependency(BLAS)
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
list(APPEND GGML_BLAS_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
list(APPEND GGML_BLAS_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
endif()
if (GGML_CUDA)
+3 -5
View File
@@ -1913,11 +1913,9 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
bcast_weight_nb[4], bcast_weight_nb[5]};
aclTensor* acl_weight_tensor;
bool weightToNZ = false;
#ifdef ASCEND_310P
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
#endif
if (weightToNZ && is_matmul_weight(weight)) {
// Only check env once.
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
if (weight_to_nz && is_matmul_weight(weight)) {
int64_t acl_stride[2] = {1, transpose_ne[1]};
// Reverse ne.
+63 -51
View File
@@ -1116,61 +1116,59 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
return GGML_STATUS_SUCCESS;
}
static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
aclDataType dataType, aclTensor **tensor)
{
uint64_t size = 1;
for (auto i : shape) {
size *= i;
// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
namespace {
void* g_nz_workspace = nullptr;
size_t g_nz_workspace_allocated = 0;
void release_nz_workspace() {
if (g_nz_workspace) {
aclrtFree(g_nz_workspace);
g_nz_workspace = nullptr;
g_nz_workspace_allocated = 0;
}
}
const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
size *= sizeof(int16_t);
ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
std::vector<int64_t> strides(shape.size(), 1);
for (int64_t i = shape.size() - 2; i >= 0; i--) {
strides[i] = shape[i + 1] * strides[i + 1];
void relloc_nz_workspace(size_t new_size) {
if (new_size > g_nz_workspace_allocated) {
if (g_nz_workspace) {
aclrtFree(g_nz_workspace);
g_nz_workspace = nullptr;
}
ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
g_nz_workspace_allocated = new_size;
}
}
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
shape.data(), shape.size(), *deviceAddr);
return 0;
}
/**
* @brief Convert tensor weights to NZ format using Ascend CANN API.
*
* This function creates a transposed tensor descriptor and performs the
* TransMatmulWeight operation. Converting tensor formats can significantly
* improve performance on certain hardware.
*
* @param tensor Pointer to the input ggml_tensor containing the weights.
* @param data Pointer to the raw data buffer for the tensor weights.
* @param offset Byte offset within the tensor data buffer where weights start.
*
* @note The workspace buffer used in this function is managed globally and reused
* across calls. This reduces overhead from repeated memory allocation and deallocation.
*/
static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
aclrtStream stream;
ACL_CHECK(aclrtCreateStream(&stream));
std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
void *weightTransposedDeviceAddr = nullptr;
aclTensor *weightTransposed = nullptr;
CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
ggml_cann_type_mapping(tensor->type), &weightTransposed);
aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
tensor->nb, 2, ACL_FORMAT_ND, offset);
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
void *workspaceAddr = nullptr;
// TransMatmulWeight
ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
if (workspaceSize > 0) {
ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
workspaceAddrPtrTrans.reset(workspaceAddr);
}
ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
&workspaceSize, &executor));
// Avoid frequent malloc/free of the workspace.
relloc_nz_workspace(workspaceSize);
size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
aclrtMemcpy((char *)tensor->data + offset, size,
weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
ACL_CHECK(aclDestroyTensor(weightTransposed));
aclrtFree(weightTransposedDeviceAddr);
}
// TODO: need handle tensor which has paddings.
@@ -1197,14 +1195,14 @@ static void ggml_backend_cann_buffer_set_tensor(
// For acl, synchronous functions use this default stream.
// Why aclrtSynchronizeDevice?
bool weightToNZ = false;
#ifdef ASCEND_310P
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
#endif
// Only check env once.
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
ACL_MEMCPY_HOST_TO_DEVICE));
if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
GGML_ASSERT(tensor->ne[2] == 1);
GGML_ASSERT(tensor->ne[3] == 1);
weight_format_to_nz(tensor, data, offset);
}
} else {
@@ -1440,20 +1438,32 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
size_t size = ggml_nbytes(tensor);
int64_t ne0 = tensor->ne[0];
// Only check env once.
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
// last line must bigger than 32, because every single op deal at
// least 32 bytes.
// TODO: quantized type?
// int64_t line_size = ne0 * ggml_element_size(tensor);
// int64_t line_size_align_32 = (line_size + 31) & ~31;
// size += (line_size_align_32 - line_size);
// TODO: not support quantized yet.
// TODO: consider un-continue tensor.
if (ggml_is_quantized(tensor->type)) {
if (ne0 % MATRIX_ROW_PADDING != 0) {
size += ggml_row_size(
tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
}
} else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
// NZ format weight are not support quantized yet.
// If ND tensor transform to NZ, size may changed.
int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
GGML_ASSERT(tensor->ne[2] == 1);
GGML_ASSERT(tensor->ne[3] == 1);
const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
size_t new_size;
ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
ggml_cann_type_mapping(tensor->type), &new_size));
ACL_CHECK(aclDestroyIntArray(acl_shape));
size = std::max(size, new_size);
}
return size;
@@ -2080,6 +2090,8 @@ static enum ggml_status ggml_backend_cann_graph_compute(
(ggml_backend_cann_context*)backend->context;
ggml_cann_set_device(cann_ctx->device);
//release temp buffer create by set tensor.
release_nz_workspace();
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor* node = cgraph->nodes[i];
+3 -4
View File
@@ -227,9 +227,9 @@ typedef float2 dfloat2;
#define FP16_MMA_AVAILABLE
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
#if defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
#define AMD_MFMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define NEW_MMA_AVAILABLE
@@ -293,10 +293,9 @@ static bool fp32_mma_hardware_available(const int cc) {
return GGML_CUDA_CC_IS_CDNA(cc);
}
// AMD CDNA3 matrix cores.. Will add support for other CDNA generations later.
static bool amd_mfma_available(const int cc) {
#if !defined(GGML_HIP_NO_MMQ_MFMA)
return GGML_CUDA_CC_IS_CDNA3(cc);
return GGML_CUDA_CC_IS_CDNA(cc);
#else
return false;
#endif //!defined(GGML_HIP_NO_MMQ_MFMA)
+20 -4
View File
@@ -109,8 +109,8 @@ void ggml_cuda_mul_mat_q(
const int64_t s03 = src0->nb[3] / ts_src0;
const int64_t s3 = dst->nb[3] / ts_dst;
const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
|| (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)));
const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
|| GGML_CUDA_CC_IS_CDNA(cc);
if (!ids) {
const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
@@ -252,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
// Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
|| (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)))
|| GGML_CUDA_CC_IS_CDNA(cc))
&& src1_ncols == ne11;
const mmq_args args = {
src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
@@ -306,7 +306,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return false;
}
if (new_mma_available(cc) || amd_mfma_available(cc)) {
if (new_mma_available(cc)) {
return true;
}
@@ -322,5 +322,21 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
if (amd_mfma_available(cc)) {
// As of ROCM 7.0 rocblas/tensile performs very poorly on CDNA3 and hipblaslt (via ROCBLAS_USE_HIPBLASLT)
// performs better but is currently suffering from a crash on this architecture.
// TODO: Revisit when hipblaslt is fixed on CDNA3
if (GGML_CUDA_CC_IS_CDNA3(cc)) {
return true;
}
if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
return true;
}
if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
return true;
}
return false;
}
return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
+2 -2
View File
@@ -3096,8 +3096,8 @@ static __global__ void mul_mat_q(
}
__syncthreads();
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
// On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
{
const int wt = blockIdx.z / nchannels_y;
const int zt = blockIdx.z - wt*nchannels_y;
+2
View File
@@ -82,6 +82,8 @@ set(GGML_OPENCL_KERNELS
mul_mv_q4_0_f32_1d_16x_flat
mul_mv_q6_k
mul_mv_id_q4_0_f32_8x_flat
mul_mm_f32_f32_l4_lm
mul_mm_f16_f32_l4_lm
mul
norm
relu
+132 -12
View File
@@ -33,6 +33,7 @@
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
#define UNUSED(x) (void)(x)
@@ -396,6 +397,8 @@ struct ggml_backend_opencl_context {
cl_program program_conv_2d_f16_f32;
cl_program program_tsembd;
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
cl_program program_mul_mm_f32_f32_l4_lm;
cl_program program_mul_mm_f16_f32_l4_lm;
cl_kernel kernel_add, kernel_add_row;
cl_kernel kernel_mul, kernel_mul_row;
@@ -450,6 +453,8 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_conv_2d_f16_f32;
cl_kernel kernel_timestep_embedding;
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
cl_kernel kernel_mul_mm_f32_f32_l4_lm;
cl_kernel kernel_mul_mm_f16_f32_l4_lm;
std::vector<ProfilingInfo> profiling_info;
@@ -1040,6 +1045,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// mul_mm_f32_f32_l4_lm
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "mul_mm_f32_f32_l4_lm.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mm_f32_f32_l4_lm.cl");
#endif
backend_ctx->program_mul_mm_f32_f32_l4_lm =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mm_f32_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_f32_f32_l4_lm, "kernel_mul_mm_f32_f32_l4_lm", &err), err));
GGML_LOG_CONT(".");
}
// mul_mm_f16_f32_l4_lm
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "mul_mm_f16_f32_l4_lm.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mm_f16_f32_l4_lm.cl");
#endif
backend_ctx->program_mul_mm_f16_f32_l4_lm =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_l4_lm, "kernel_mul_mm_f16_f32_l4_lm", &err), err));
GGML_LOG_CONT(".");
}
// mul
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -5297,18 +5334,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
src0->ne[1] > 32 && // M > 32
src1->ne[1] > 32 && // N > 32
src0->ne[0] > 32 && // K > 32
src0->ne[2] == 1 && src0->ne[3] == 1 &&
src1->ne[2] == 1 && src1->ne[3] == 1 &&
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
return;
}
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -5655,6 +5680,101 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
} // if (ne01 && ne1)
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
// GEMM using local memory
// Current BK = 16, so ne00 % 16 == 0
if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
src1t == GGML_TYPE_F32 &&
ne00 % 16 == 0 &&
ne11 > 1) {
switch(src0t) {
case GGML_TYPE_F32: {
kernel = backend_ctx->kernel_mul_mm_f32_f32_l4_lm;
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
int batch_stride_a = ne00*ne01;
int batch_stride_b = ne10*ne11;
int batch_stride_d = ne0*ne1;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
// 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
return;
}
case GGML_TYPE_F16: {
kernel = backend_ctx->kernel_mul_mm_f16_f32_l4_lm;
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
int batch_stride_a = ne00*ne01;
int batch_stride_b = ne10*ne11;
int batch_stride_d = ne0*ne1;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
// 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
return;
}
default:
break;
}
}
if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
src0->ne[1] > 32 && // M > 32
src1->ne[1] > 32 && // N > 32
src0->ne[0] > 32 && // K > 32
src0->ne[2] == 1 && src0->ne[3] == 1 &&
src1->ne[2] == 1 && src1->ne[3] == 1 &&
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
return;
}
if (!ggml_is_transposed(src0) &&
!ggml_is_transposed(src1) &&
src1t == GGML_TYPE_F32 &&
@@ -0,0 +1,132 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define LOAD_VEC_A 4
#define LOAD_VEC_B 4
#define BM 64
#define BN 64
#define BK 16
#define TM 4
#define TN 8
kernel void kernel_mul_mm_f16_f32_l4_lm(
global half4 * src0,
ulong offset0,
global float4 * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne11,
int ne12,
int stride_a,
int stride_b,
int stride_d,
int batch_stride_a,
int batch_stride_b,
int batch_stride_d,
int r2,
int r3
) {
src0 = (global half4*)((global char*)src0 + offset0);
src1 = (global float4*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
local half buf_a[BM * BK];
local float buf_b[BN * BK];
const int batch_idx = get_global_id(2);
const int i13 = batch_idx / ne12;
const int i12 = batch_idx % ne12;
const int i03 = i13 / r3;
const int i02 = i12 / r2;
const int batch_idx_a = i03 * ne02 + i02;
const int ir = get_group_id(0);
const int ic = get_group_id(1);
const int tid = get_local_id(0);
const int th_r = tid % (BM / TM);
const int th_c = tid / (BM / TM);
const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
float sums[TM * TN];
half cache_a[TM];
float cache_b[TN];
for (int i = 0; i < TM * TN; i++) {
sums[i] = 0.0f;
}
for (int block = 0; block < ne00; block += BK) {
for (int l = 0; l < BM; l += loadstride_a) {
const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2;
buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = src0[idx].s3;
}
for (int l = 0; l < BN; l += loadstride_b) {
const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
}
barrier(CLK_LOCAL_MEM_FENCE);
pos_a += BK / LOAD_VEC_A;
pos_b += BK / LOAD_VEC_B;
for (int i = 0; i < BK; i++) {
for (int j = 0; j < TM; j++) {
cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
}
for (int j = 0; j < TN; j++) {
cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
}
for (int cc = 0; cc < TN; cc++) {
for (int cr = 0; cr < TM; cr++) {
const int sums_idx = cc*TM + cr;
sums[sums_idx] = mad(convert_float(cache_a[cr]), cache_b[cc], sums[sums_idx]);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
const int dr = ir * BM + th_r * TM;
const int dc = ic * BN + th_c * TN;
const int offsets = batch_idx * batch_stride_d;
for (int cc = 0; cc < TN; cc++) {
for (int cr = 0; cr < TM; cr++) {
if (dr + cr < ne01 && dc + cc < ne11) {
dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
}
}
}
}
@@ -0,0 +1,133 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define LOAD_VEC_A 4
#define LOAD_VEC_B 4
#define BM 64
#define BN 64
#define BK 16
#define TM 4
#define TN 8
kernel void kernel_mul_mm_f32_f32_l4_lm(
global float4 * src0,
ulong offset0,
global float4 * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne11,
int ne12,
int stride_a,
int stride_b,
int stride_d,
int batch_stride_a,
int batch_stride_b,
int batch_stride_d,
int r2,
int r3
) {
src0 = (global float4*)((global char*)src0 + offset0);
src1 = (global float4*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
local float buf_a[BM * BK];
local float buf_b[BN * BK];
const int batch_idx = get_global_id(2);
const int i13 = batch_idx / ne12;
const int i12 = batch_idx % ne12;
const int i03 = i13 / r3;
const int i02 = i12 / r2;
const int batch_idx_a = i03 * ne02 + i02;
const int ir = get_group_id(0);
const int ic = get_group_id(1);
const int tid = get_local_id(0);
const int th_r = tid % (BM / TM);
const int th_c = tid / (BM / TM);
const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
float sums[TM * TN];
float cache_a[TM];
float cache_b[TN];
for (int i = 0; i < TM * TN; i++) {
sums[i] = 0.0f;
}
for (int block = 0; block < ne00; block += BK) {
for (int l = 0; l < BM; l += loadstride_a) {
const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2;
buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = src0[idx].s3;
}
for (int l = 0; l < BN; l += loadstride_b) {
const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
}
barrier(CLK_LOCAL_MEM_FENCE);
pos_a += BK / LOAD_VEC_A;
pos_b += BK / LOAD_VEC_B;
for (int i = 0; i < BK; i++) {
for (int j = 0; j < TM; j++) {
cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
}
for (int j = 0; j < TN; j++) {
cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
}
for (int cc = 0; cc < TN; cc++) {
for (int cr = 0; cr < TM; cr++) {
const int sums_idx = cc*TM + cr;
sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
const int dr = ir * BM + th_r * TM;
const int dc = ic * BN + th_c * TN;
const int offsets = batch_idx * batch_stride_d;
for (int cc = 0; cc < TN; cc++) {
for (int cr = 0; cr < TM; cr++) {
if (dr + cr < ne01 && dc + cc < ne11) {
dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
}
}
}
}
+1 -1
View File
@@ -1341,7 +1341,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
vk::DebugUtilsObjectNameInfoEXT duoni;
duoni.objectType = vk::ObjectType::ePipeline;
duoni.pObjectName = pipeline->name.c_str();
duoni.objectHandle = reinterpret_cast<uint64_t>(static_cast<VkPipeline_T*>(pipeline->pipeline));
duoni.objectHandle = /*reinterpret_cast*/(uint64_t)(static_cast<VkPipeline>(pipeline->pipeline));
vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast<VkDebugUtilsObjectNameInfoEXT &>(duoni));
}
+20
View File
@@ -279,6 +279,9 @@ class Keys:
class Projector:
STACK_FACTOR = "clip.audio.projector.stack_factor"
class Diffusion:
SHIFT_LOGITS = "diffusion.shift_logits"
#
# recommended mapping of model tensor names for storage in gguf
#
@@ -377,6 +380,7 @@ class MODEL_ARCH(IntEnum):
LFM2 = auto()
DREAM = auto()
SMALLTHINKER = auto()
LLADA = auto()
class VISION_PROJECTOR_TYPE(IntEnum):
@@ -697,6 +701,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.LFM2: "lfm2",
MODEL_ARCH.DREAM: "dream",
MODEL_ARCH.SMALLTHINKER: "smallthinker",
MODEL_ARCH.LLADA: "llada",
}
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -1318,6 +1323,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.LLADA: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.QWEN2VL: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
+5
View File
@@ -1047,6 +1047,11 @@ class GGUFWriter:
def add_audio_stack_factor(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)
# diffusion models
def add_diffusion_shift_logits(self, value: bool) -> None:
self.add_bool(Keys.Diffusion.SHIFT_LOGITS, value)
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
pack_prefix = ''
if not skip_pack_prefix:
+12
View File
@@ -32,6 +32,7 @@ class TensorNameMap:
"model.word_embeddings", # bailingmoe
"language_model.model.embed_tokens", # llama4
"encoder", # neobert
"model.transformer.wte", # llada
),
# Token type embeddings
@@ -71,6 +72,7 @@ class TensorNameMap:
"head", # rwkv
"head.out", # wavtokenizer
"lm_head", # llama4
"model.transformer.ff_out", # llada
),
# Output norm
@@ -94,6 +96,7 @@ class TensorNameMap:
"model.ln_out", # rwkv7
"backbone.final_layer_norm", # wavtokenizer
"model.norm", # llama4
"model.transformer.ln_f", # llada
),
# Rope frequencies
@@ -139,6 +142,7 @@ class TensorNameMap:
"model.layers.{bid}.input_layernorm", # llama4
"transformer_encoder.{bid}.attention_norm", # neobert
"model.layers.{bid}.operator_norm", # lfm2
"model.transformer.blocks.{bid}.attn_norm", # llada
),
# Attention norm 2
@@ -183,6 +187,7 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
"transformer.h.{bid}.attn.attention.q_proj", # exaone
"model.layers.{bid}.self_attn.q_proj", # llama4
"model.transformer.blocks.{bid}.q_proj", # llada
),
# Attention key
@@ -199,6 +204,7 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
"transformer.h.{bid}.attn.attention.k_proj", # exaone
"model.layers.{bid}.self_attn.k_proj", # llama4
"model.transformer.blocks.{bid}.k_proj", # llada
),
# Attention value
@@ -214,6 +220,7 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
"transformer.h.{bid}.attn.attention.v_proj", # exaone
"model.layers.{bid}.self_attn.v_proj", # llama4
"model.transformer.blocks.{bid}.v_proj", # llada
),
# Attention output
@@ -246,6 +253,7 @@ class TensorNameMap:
"transformer.h.{bid}.attn.attention.out_proj", # exaone
"model.layers.{bid}.self_attn.o_proj", # llama4
"transformer_encoder.{bid}.wo", # neobert
"model.transformer.blocks.{bid}.attn_out", # llada
),
# Attention output norm
@@ -291,6 +299,7 @@ class TensorNameMap:
"model.layers.{bid}.post_attention_layernorm", # llama4
"transformer_encoder.{bid}.ffn_norm", # neobert
"model.layers.layers.{bid}.pre_mlp_norm", # plamo2
"model.transformer.blocks.{bid}.ff_norm", # llada
),
# Post feed-forward norm
@@ -364,6 +373,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.up_proj", # llama4 jamba granite-hybrid
"transformer_encoder.{bid}.ffn.w12", # neobert
"model.layers.{bid}.block_sparse_moe.up", # smallthinker
"model.transformer.blocks.{bid}.up_proj", # llada
),
MODEL_TENSOR.FFN_UP_EXP: (
@@ -405,6 +415,7 @@ class TensorNameMap:
"transformer.h.{bid}.mlp.c_fc_0", # exaone
"model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid
"model.layers.{bid}.block_sparse_moe.gate", # smallthinker
"model.transformer.blocks.{bid}.ff_proj", # llada
),
MODEL_TENSOR.FFN_GATE_EXP: (
@@ -454,6 +465,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.down_proj", # llama4 jamba granite-hybrid
"transformer_encoder.{bid}.ffn.w3", # neobert
"model.layers.{bid}.block_sparse_moe.down", # smallthinker
"model.transformer.blocks.{bid}.ff_out", # llada
),
MODEL_TENSOR.FFN_DOWN_EXP: (
+3
View File
@@ -537,6 +537,9 @@ extern "C" {
// Returns true if the model is recurrent (like Mamba, RWKV, etc.)
LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
// Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
// Returns 0 on success
LLAMA_API uint32_t llama_model_quantize(
const char * fname_inp,
+1 -1
View File
@@ -1 +1 @@
b7bfde9c88aa4b063ce68dab6cc4f5c6caae37fd
daf7906728036a82f20c69fcbd74b6f536c74d3f
+19
View File
@@ -89,6 +89,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LFM2, "lfm2" },
{ LLM_ARCH_DREAM, "dream" },
{ LLM_ARCH_SMALLTHINKER, "smallthinker" },
{ LLM_ARCH_LLADA, "llada" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -1972,6 +1973,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_LLADA,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_UNKNOWN,
{
@@ -2224,6 +2242,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
bool llm_arch_is_diffusion(const llm_arch & arch) {
switch (arch) {
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
return true;
default:
return false;
+1
View File
@@ -93,6 +93,7 @@ enum llm_arch {
LLM_ARCH_LFM2,
LLM_ARCH_DREAM,
LLM_ARCH_SMALLTHINKER,
LLM_ARCH_LLADA,
LLM_ARCH_UNKNOWN,
};
+23 -15
View File
@@ -1644,16 +1644,17 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
ggml_tensor * llm_graph_context::build_rs(
ggml_tensor * s,
ggml_tensor * state_copy,
ggml_tensor * state_copy_main,
ggml_tensor * state_copy_extra,
int32_t state_size,
int32_t n_seqs,
uint32_t n_kv,
uint32_t kv_head,
uint32_t kv_size,
uint32_t n_rs,
uint32_t rs_head,
uint32_t rs_size,
int32_t rs_zero,
const llm_graph_get_rows_fn & get_state_rows) const {
ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, kv_size);
ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, rs_size);
// Clear a single state which will then be copied to the other cleared states.
// Note that this is a no-op when the view is zero-sized.
@@ -1661,39 +1662,44 @@ ggml_tensor * llm_graph_context::build_rs(
ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
// copy states
// NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
// {state_size, kv_size} -> {state_size, n_seqs}
ggml_tensor * output_states = get_state_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_seqs, 0));
// NOTE: assuming the copy destinations are ALL contained between rs_head and rs_head + n_rs
// {state_size, rs_size} -> {state_size, n_seqs}
ggml_tensor * output_states = get_state_rows(ctx0, states, state_copy_main);
ggml_build_forward_expand(gf, output_states);
// copy extra states which won't be changed further (between n_seqs and n_kv)
ggml_tensor * states_extra = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_kv - n_seqs, n_seqs*state_copy->nb[0]));
// copy extra states which won't be changed further (between n_seqs and n_rs)
ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra);
ggml_build_forward_expand(gf,
ggml_cpy(ctx0,
states_extra,
ggml_view_1d(ctx0, s, state_size*(n_kv - n_seqs), (kv_head + n_seqs)*state_size*ggml_element_size(s))));
ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s))));
return output_states;
}
static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
ggml_context * ctx0,
const llama_ubatch & ubatch,
const llama_memory_recurrent_context * mctx_cur) {
auto inp = std::make_unique<llm_graph_input_rs>(mctx_cur);
const auto n_rs = mctx_cur->get_n_rs();
const int64_t n_rs = mctx_cur->get_n_rs();
const int64_t n_seqs = ubatch.n_seqs;
inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
ggml_set_input(inp->s_copy);
inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
return inp;
}
llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
auto inp = build_rs_inp_impl(ctx0, mctx_cur);
auto inp = build_rs_inp_impl(ctx0, ubatch, mctx_cur);
return (llm_graph_input_rs *) res->add_input(std::move(inp));
}
@@ -1706,7 +1712,9 @@ ggml_tensor * llm_graph_context::build_rs(
const llm_graph_get_rows_fn & get_state_rows) const {
const auto * kv_state = inp->mctx;
return build_rs(s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows);
return build_rs(s, inp->s_copy_main, inp->s_copy_extra, state_size, n_seqs,
kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(),
get_state_rows);
}
ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
@@ -1753,7 +1761,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
auto inp_rs = build_rs_inp_impl(ctx0, mctx_cur->get_recr());
auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
+11 -6
View File
@@ -214,7 +214,12 @@ public:
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * s_copy; // I32 [kv_size]
ggml_tensor * s_copy; // I32 [n_rs]
// views of s_copy, computed once per graph
// and shared across layers which use build_rs
ggml_tensor * s_copy_main; // I32 [n_seqs]
ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
const llama_memory_recurrent_context * mctx;
};
@@ -730,7 +735,6 @@ struct llm_graph_context {
// recurrent
//
// TODO: avoid notion of "kv"
// TODO: move this implementation to llama_memory_recurrent.
// this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
// when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
@@ -738,12 +742,13 @@ struct llm_graph_context {
// `llama_memory_recurrent`
ggml_tensor * build_rs(
ggml_tensor * s,
ggml_tensor * state_copy,
ggml_tensor * state_copy_main,
ggml_tensor * state_copy_extra,
int32_t state_size,
int32_t n_seqs,
uint32_t n_kv,
uint32_t kv_head,
uint32_t kv_size,
uint32_t n_rs,
uint32_t rs_head,
uint32_t rs_size,
int32_t rs_zero,
const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
+173
View File
@@ -869,6 +869,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.causal_attn = false;
}
break;
case LLM_ARCH_LLADA:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
// LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
switch (hparams.n_layer) {
case 32:
type = LLM_TYPE_8B;
break;
default:
type = LLM_TYPE_UNKNOWN;
}
// Set non-causal attention for diffusion models
hparams.causal_attn = false;
}
break;
case LLM_ARCH_QWEN2MOE:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -2149,6 +2164,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
case LLM_ARCH_LLADA:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (output == NULL) {
output =
create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
// Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
layer.wq =
create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
// No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
layer.wo =
create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
// optional MLP bias
layer.ffn_gate_b =
create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
layer.ffn_down_b =
create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
}
}
break;
case LLM_ARCH_LLAMA4:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -8042,6 +8104,106 @@ struct llm_build_dream : public llm_graph_context {
}
};
struct llm_build_llada : public llm_graph_context {
llm_build_llada(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
// LLaDA is similar to LLaMA but uses non-causal attention for diffusion
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
// Non-causal attention for diffusion
auto * inp_attn = build_attn_inp_no_cache();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
};
struct llm_build_qwen2vl : public llm_graph_context {
llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -17201,6 +17363,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
{
res = nullptr;
} break;
@@ -17367,6 +17530,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
llm = std::make_unique<llm_build_dream>(*this, params);
}
break;
case LLM_ARCH_LLADA:
{
llm = std::make_unique<llm_build_llada>(*this, params);
}
break;
case LLM_ARCH_QWEN2VL:
{
llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -17765,6 +17933,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLADA:
case LLM_ARCH_LLAMA4:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
@@ -17943,6 +18112,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
return llm_arch_is_recurrent(model->arch);
}
bool llama_model_is_diffusion(const llama_model * model) {
return llm_arch_is_diffusion(model->arch);
}
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
return model->tensors_by_name;
}
+1 -1
View File
@@ -311,7 +311,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
int64_t n_datasets = gguf_get_arr_n(ctx_gguf, dataset_idx);
imatrix_datasets.reserve(n_datasets);
for (int64_t i = 0; i < n_datasets; ++i) {
imatrix_datasets.push_back(gguf_get_val_str(ctx_gguf, dataset_idx));
imatrix_datasets.push_back(gguf_get_arr_str(ctx_gguf, dataset_idx, i));
}
printf("%s: imatrix datasets=['%s'", __func__, imatrix_datasets[0].c_str());
for (size_t i = 1; i < imatrix_datasets.size(); ++i) {
+9
View File
@@ -644,6 +644,15 @@ The same as [the embedding example](../embedding) does.
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
`embd_normalize`: Normalization for pooled embeddings. Can be one of the following values:
```
-1: No normalization
0: Max absolute
1: Taxicab
2: Euclidean/L2
>2: P-Norm
```
### POST `/reranking`: Rerank documents according to a given query
Similar to https://jina.ai/reranker/ but might change in the future.
+13 -1
View File
@@ -138,6 +138,9 @@ struct slot_params {
std::string oaicompat_cmpl_id;
common_chat_syntax oaicompat_chat_syntax;
// Embeddings
int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
json to_json() const {
std::vector<std::string> samplers;
samplers.reserve(sampling.samplers.size());
@@ -2601,7 +2604,7 @@ struct server_context {
// normalize only when there is pooling
if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
common_embd_normalize(embd, embd_res.data(), n_embd, 2);
common_embd_normalize(embd, embd_res.data(), n_embd, slot.params.embd_normalize);
res->embedding.push_back(embd_res);
break;
} else {
@@ -4614,6 +4617,14 @@ int main(int argc, char ** argv) {
}
}
int embd_normalize = 2; // default to Euclidean/L2 norm
if (body.count("embd_normalize") != 0) {
embd_normalize = body.at("embd_normalize");
if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
SRV_DBG("embd_normalize is not supported by pooling type %d, ignoring it\n", llama_pooling_type(ctx_server.ctx));
}
}
// create and queue the task
json responses = json::array();
bool error = false;
@@ -4629,6 +4640,7 @@ int main(int argc, char ** argv) {
// OAI-compat
task.params.oaicompat = oaicompat;
task.params.embd_normalize = embd_normalize;
tasks.push_back(std::move(task));
}