mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 18:17:42 +02:00
Compare commits
25 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 0e70ba686e | |||
| 46828872c3 | |||
| 6b064c92b4 | |||
| 4da69d1abd | |||
| d62b532c52 | |||
| 081b29bd2a | |||
| 5437d4aaf5 | |||
| 78f766768d | |||
| 8dd19a4812 | |||
| 130d0c90bd | |||
| 3919da8e33 | |||
| 0006f5a74a | |||
| 05c3a444b8 | |||
| 382bc7f2e8 | |||
| 4f51968aca | |||
| 227d7c5a7f | |||
| 7b1ec53f56 | |||
| 160bc039c8 | |||
| 08ea539df2 | |||
| 644fd71b44 | |||
| 4ddd199f6f | |||
| a0974156f3 | |||
| 87cf323cef | |||
| 5478bbcd17 | |||
| b5ae1ddff9 |
@@ -317,7 +317,7 @@ jobs:
|
||||
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
|
||||
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y build-essential vulkan-sdk
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -327,6 +327,12 @@ jobs:
|
||||
cmake -DGGML_VULKAN=ON ..
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
ubuntu-22-cmake-hip:
|
||||
runs-on: ubuntu-22.04
|
||||
container: rocm/dev-ubuntu-22.04:6.0.2
|
||||
|
||||
@@ -98,6 +98,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
|
||||
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
|
||||
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
|
||||
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
|
||||
|
||||
#### Multimodal
|
||||
|
||||
@@ -220,7 +221,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
|
||||
| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
|
||||
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
|
||||
| [hipBLAS](docs/build.md#hipblas) | AMD GPU |
|
||||
| [HIP](docs/build.md#hip) | AMD GPU |
|
||||
| [Vulkan](docs/build.md#vulkan) | GPU |
|
||||
| [CANN](docs/build.md#cann) | Ascend NPU |
|
||||
|
||||
@@ -413,7 +414,7 @@ To learn more about model quantization, [read this documentation](examples/quant
|
||||
[^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
|
||||
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
|
||||
|
||||
## [`llama-bench`](example/bench)
|
||||
## [`llama-bench`](examples/llama-bench)
|
||||
|
||||
#### Benchmark the performance of the inference for various parameters.
|
||||
|
||||
|
||||
+6
-7
@@ -855,13 +855,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.sampling.ignore_eos = true;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--penalize-nl"},
|
||||
string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.sampling.penalize_nl = true;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--temp"}, "N",
|
||||
string_format("temperature (default: %.1f)", (double)params.sampling.temp),
|
||||
@@ -916,6 +909,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--repeat-last-n"}, "N",
|
||||
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
|
||||
[](common_params & params, int value) {
|
||||
if (value < -1) {
|
||||
throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
|
||||
}
|
||||
params.sampling.penalty_last_n = value;
|
||||
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
|
||||
}
|
||||
@@ -970,6 +966,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--dry-penalty-last-n"}, "N",
|
||||
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
|
||||
[](common_params & params, int value) {
|
||||
if (value < -1) {
|
||||
throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
|
||||
}
|
||||
params.sampling.dry_penalty_last_n = value;
|
||||
}
|
||||
).set_sparam());
|
||||
|
||||
@@ -940,6 +940,25 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
params.sampling.ignore_eos = false;
|
||||
}
|
||||
|
||||
if (params.sampling.ignore_eos) {
|
||||
for (llama_token i = 0; i < llama_n_vocab(model); i++) {
|
||||
if (llama_token_is_eog(model, i)) {
|
||||
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
|
||||
params.sampling.logit_bias.push_back({i, -INFINITY});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (params.sampling.penalty_last_n == -1) {
|
||||
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||
params.sampling.penalty_last_n = llama_n_ctx(lctx);
|
||||
}
|
||||
|
||||
if (params.sampling.dry_penalty_last_n == -1) {
|
||||
LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||
params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
|
||||
}
|
||||
|
||||
if (params.warmup) {
|
||||
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||
|
||||
|
||||
+9
-6
@@ -95,6 +95,7 @@ enum common_sampler_type {
|
||||
COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
|
||||
COMMON_SAMPLER_TYPE_XTC = 8,
|
||||
COMMON_SAMPLER_TYPE_INFILL = 9,
|
||||
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
||||
};
|
||||
|
||||
// dimensionality reduction methods, used by cvector-generator
|
||||
@@ -130,7 +131,6 @@ struct common_params_sampling {
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||
bool ignore_eos = false;
|
||||
bool no_perf = false; // disable performance metrics
|
||||
bool timing_per_token = false;
|
||||
@@ -139,6 +139,7 @@ struct common_params_sampling {
|
||||
|
||||
|
||||
std::vector<enum common_sampler_type> samplers = {
|
||||
COMMON_SAMPLER_TYPE_PENALTIES,
|
||||
COMMON_SAMPLER_TYPE_DRY,
|
||||
COMMON_SAMPLER_TYPE_TOP_K,
|
||||
COMMON_SAMPLER_TYPE_TYPICAL_P,
|
||||
@@ -193,11 +194,13 @@ struct common_params {
|
||||
float defrag_thold = 0.1f; // KV cache defragmentation threshold
|
||||
|
||||
// offload params
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
|
||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
|
||||
+11
-16
@@ -161,32 +161,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
params.logit_bias.size(),
|
||||
params.logit_bias.data()));
|
||||
|
||||
llama_sampler_chain_add(result->chain,
|
||||
llama_sampler_init_penalties(
|
||||
llama_n_vocab (model),
|
||||
llama_token_eos(model),
|
||||
llama_token_nl (model),
|
||||
params.penalty_last_n,
|
||||
params.penalty_repeat,
|
||||
params.penalty_freq,
|
||||
params.penalty_present,
|
||||
params.penalize_nl,
|
||||
params.ignore_eos));
|
||||
|
||||
if (params.mirostat == 0) {
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
switch (cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_DRY:
|
||||
case COMMON_SAMPLER_TYPE_DRY:
|
||||
{
|
||||
std::vector<const char*> c_breakers;
|
||||
std::vector<const char *> c_breakers;
|
||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
||||
for (const auto& str : params.dry_sequence_breakers) {
|
||||
for (const auto & str : params.dry_sequence_breakers) {
|
||||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
}
|
||||
break;
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
@@ -208,6 +196,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
@@ -415,6 +406,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
||||
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
||||
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
|
||||
default : return '?';
|
||||
}
|
||||
}
|
||||
@@ -429,6 +421,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
||||
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
|
||||
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
|
||||
default : return "";
|
||||
}
|
||||
}
|
||||
@@ -443,6 +436,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
|
||||
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
|
||||
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
|
||||
};
|
||||
|
||||
// since samplers names are written multiple ways
|
||||
@@ -489,6 +483,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES },
|
||||
};
|
||||
|
||||
std::vector<common_sampler_type> samplers;
|
||||
|
||||
@@ -664,6 +664,9 @@ class Model:
|
||||
if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
|
||||
# ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
|
||||
res = "roberta-bpe"
|
||||
if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
|
||||
# ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
|
||||
res = "gigachat"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -3427,6 +3430,97 @@ class ArcticModel(Model):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@Model.register("DeepseekForCausalLM")
|
||||
class DeepseekModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK
|
||||
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
if "head_dim" in hparams:
|
||||
rope_dim = hparams["head_dim"]
|
||||
else:
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
|
||||
self.gguf_writer.add_expert_weights_scale(1.0)
|
||||
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
|
||||
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
@staticmethod
|
||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
n_head = n_head_kv
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
|
||||
|
||||
# process the experts separately
|
||||
if name.find("mlp.experts") != -1:
|
||||
n_experts = self.hparams["n_routed_experts"]
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
# merge the experts into a single 3d tensor
|
||||
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
return tensors
|
||||
else:
|
||||
return []
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@Model.register("DeepseekV2ForCausalLM")
|
||||
class DeepseekV2Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
||||
|
||||
@@ -104,6 +104,7 @@ models = [
|
||||
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
|
||||
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
|
||||
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
|
||||
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -65,6 +65,7 @@ int main(int argc, char ** argv) {
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
sparams.no_perf = false;
|
||||
|
||||
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||
|
||||
|
||||
@@ -88,6 +88,8 @@ def main(args):
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
local_model = False
|
||||
model_path = ""
|
||||
model_name = args.model_name
|
||||
print("model_name: ", model_name)
|
||||
qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
|
||||
@@ -97,8 +99,10 @@ def main(args):
|
||||
vcfg = cfg.vision_config
|
||||
|
||||
if os.path.isdir(model_name):
|
||||
local_model = True
|
||||
if model_name.endswith(os.sep):
|
||||
model_name = model_name[:-1]
|
||||
model_path = model_name
|
||||
model_name = os.path.basename(model_name)
|
||||
fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
|
||||
|
||||
@@ -139,7 +143,10 @@ def main(args):
|
||||
it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
|
||||
"""
|
||||
|
||||
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
|
||||
if local_model:
|
||||
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path)
|
||||
else:
|
||||
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
|
||||
fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
|
||||
fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]
|
||||
|
||||
|
||||
@@ -177,16 +177,11 @@ Example usage: `--temp 0`
|
||||
|
||||
- `--repeat-penalty N`: Control the repetition of token sequences in the generated text default: 1.0, 1.0 = disabled).
|
||||
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
|
||||
- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
|
||||
|
||||
The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.
|
||||
|
||||
The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
|
||||
|
||||
Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
|
||||
|
||||
Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
|
||||
|
||||
### DRY Repetition Penalty
|
||||
|
||||
DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)).
|
||||
|
||||
@@ -15,7 +15,7 @@ set(TARGET_SRCS
|
||||
httplib.h
|
||||
)
|
||||
set(PUBLIC_ASSETS
|
||||
index.html
|
||||
index.html.gz
|
||||
loading.html
|
||||
)
|
||||
|
||||
|
||||
@@ -104,7 +104,6 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
|
||||
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) |
|
||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||
| `--penalize-nl` | penalize newline tokens (default: false) |
|
||||
| `--temp N` | temperature (default: 0.8) |
|
||||
| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
|
||||
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
|
||||
@@ -393,8 +392,6 @@ These words will not be included in the completion, so make sure to add them to
|
||||
|
||||
`repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
|
||||
|
||||
`penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true`
|
||||
|
||||
`presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.
|
||||
|
||||
`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
|
||||
@@ -441,19 +438,22 @@ These words will not be included in the completion, so make sure to add them to
|
||||
|
||||
`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
|
||||
|
||||
`return_tokens`: Return the raw generated token ids in the `tokens` field. Otherwise `tokens` remains empty. Default: `false`
|
||||
|
||||
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
|
||||
|
||||
`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
|
||||
|
||||
**Response format**
|
||||
|
||||
- Note: In streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
|
||||
- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
|
||||
|
||||
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
|
||||
|
||||
```json
|
||||
{
|
||||
"content": "<the token selected by the model>",
|
||||
"content": "<the token generated by the model>",
|
||||
"tokens": [ generated token ids if requested ],
|
||||
"probs": [
|
||||
{
|
||||
"prob": float,
|
||||
@@ -471,6 +471,7 @@ These words will not be included in the completion, so make sure to add them to
|
||||
Notice that each `probs` is an array of length `n_probs`.
|
||||
|
||||
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
|
||||
- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request.
|
||||
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
|
||||
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
|
||||
- `model`: The path to the model loaded with `-m`
|
||||
@@ -655,7 +656,6 @@ This endpoint is public (no API key check). By default, it is read-only. To make
|
||||
"mirostat": 0,
|
||||
"mirostat_tau": 5.0,
|
||||
"mirostat_eta": 0.10000000149011612,
|
||||
"penalize_nl": false,
|
||||
"stop": [],
|
||||
"max_tokens": -1,
|
||||
"n_keep": 0,
|
||||
@@ -845,7 +845,6 @@ Example:
|
||||
"mirostat": 0,
|
||||
"mirostat_tau": 5.0,
|
||||
"mirostat_eta": 0.10000000149011612,
|
||||
"penalize_nl": false,
|
||||
"stop": [],
|
||||
"max_tokens": -1,
|
||||
"n_keep": 0,
|
||||
|
||||
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -39,7 +39,6 @@
|
||||
temperature: 0.8, // adapt all following parameters to optimized min-p requierements. If for non-english, set to 0.6 or lower
|
||||
repeat_last_n: 0, // 0 = disable penalty, -1 = context size
|
||||
repeat_penalty: 1.0, // 1.0 = disabled
|
||||
penalize_nl: false, // true only useful for infinite completion
|
||||
dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
|
||||
dry_base: 1.75, // 0.0 = disabled
|
||||
dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well
|
||||
|
||||
@@ -303,7 +303,6 @@
|
||||
temperature: 0.7,
|
||||
repeat_last_n: 256, // 0 = disable penalty, -1 = context size
|
||||
repeat_penalty: 1.18, // 1.0 = disabled
|
||||
penalize_nl: false,
|
||||
dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
|
||||
dry_base: 1.75, // 0.0 = disabled
|
||||
dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well
|
||||
@@ -1006,7 +1005,6 @@
|
||||
${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
|
||||
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
|
||||
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
|
||||
${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
|
||||
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
|
||||
${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
|
||||
${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
|
||||
|
||||
+85
-28
@@ -15,7 +15,7 @@
|
||||
#define MIMETYPE_JSON "application/json; charset=utf-8"
|
||||
|
||||
// auto generated files (update with ./deps.sh)
|
||||
#include "index.html.hpp"
|
||||
#include "index.html.gz.hpp"
|
||||
#include "loading.html.hpp"
|
||||
|
||||
#include <atomic>
|
||||
@@ -79,8 +79,9 @@ enum error_type {
|
||||
};
|
||||
|
||||
struct slot_params {
|
||||
bool stream = true;
|
||||
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
|
||||
bool stream = true;
|
||||
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
|
||||
bool return_tokens = false;
|
||||
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
|
||||
@@ -135,7 +136,6 @@ struct slot_params {
|
||||
{"mirostat", sampling.mirostat},
|
||||
{"mirostat_tau", sampling.mirostat_tau},
|
||||
{"mirostat_eta", sampling.mirostat_eta},
|
||||
{"penalize_nl", sampling.penalize_nl},
|
||||
{"stop", antiprompt},
|
||||
{"max_tokens", n_predict}, // User configured n_predict
|
||||
{"n_keep", n_keep},
|
||||
@@ -184,6 +184,7 @@ struct server_task {
|
||||
|
||||
static slot_params params_from_json_cmpl(
|
||||
const llama_model * model,
|
||||
const llama_context * ctx,
|
||||
const common_params & params_base,
|
||||
const json & data) {
|
||||
slot_params params;
|
||||
@@ -199,6 +200,7 @@ struct server_task {
|
||||
|
||||
params.stream = json_value(data, "stream", false);
|
||||
params.cache_prompt = json_value(data, "cache_prompt", true);
|
||||
params.return_tokens = json_value(data, "return_tokens", false);
|
||||
params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
|
||||
params.n_indent = json_value(data, "n_indent", defaults.n_indent);
|
||||
params.n_keep = json_value(data, "n_keep", defaults.n_keep);
|
||||
@@ -226,7 +228,6 @@ struct server_task {
|
||||
params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat);
|
||||
params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau);
|
||||
params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta);
|
||||
params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl);
|
||||
params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
|
||||
params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
|
||||
params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
|
||||
@@ -239,8 +240,27 @@ struct server_task {
|
||||
params.speculative.n_min = std::max(params.speculative.n_min, 2);
|
||||
params.speculative.n_max = std::max(params.speculative.n_max, 0);
|
||||
|
||||
// TODO: add more sanity checks for the input parameters
|
||||
|
||||
if (params.sampling.penalty_last_n < -1) {
|
||||
throw std::runtime_error("Error: repeat_last_n must be >= -1");
|
||||
}
|
||||
|
||||
if (params.sampling.dry_penalty_last_n < -1) {
|
||||
throw std::runtime_error("Error: dry_penalty_last_n must be >= -1");
|
||||
}
|
||||
|
||||
if (params.sampling.penalty_last_n == -1) {
|
||||
// note: should be the slot's context and not the full context, but it's ok
|
||||
params.sampling.penalty_last_n = llama_n_ctx(ctx);
|
||||
}
|
||||
|
||||
if (params.sampling.dry_penalty_last_n == -1) {
|
||||
params.sampling.dry_penalty_last_n = llama_n_ctx(ctx);
|
||||
}
|
||||
|
||||
if (params.sampling.dry_base < 1.0f) {
|
||||
params.sampling.dry_base = defaults.sampling.dry_base;
|
||||
params.sampling.dry_base = defaults.sampling.dry_base;
|
||||
}
|
||||
|
||||
// sequence breakers for DRY
|
||||
@@ -450,7 +470,10 @@ struct completion_token_output {
|
||||
|
||||
struct server_task_result_cmpl_final : server_task_result {
|
||||
int index = 0;
|
||||
std::string content;
|
||||
|
||||
std::string content;
|
||||
llama_tokens tokens;
|
||||
|
||||
bool stream;
|
||||
result_timings timings;
|
||||
std::string prompt;
|
||||
@@ -492,6 +515,7 @@ struct server_task_result_cmpl_final : server_task_result {
|
||||
json res = json {
|
||||
{"index", index},
|
||||
{"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk
|
||||
{"tokens", stream ? llama_tokens {} : tokens},
|
||||
{"id_slot", id_slot},
|
||||
{"stop", true},
|
||||
{"model", oaicompat_model},
|
||||
@@ -521,9 +545,9 @@ struct server_task_result_cmpl_final : server_task_result {
|
||||
json choices = json::array({json{
|
||||
{"finish_reason", finish_reason},
|
||||
{"index", 0},
|
||||
{"message", json{
|
||||
{"message", json {
|
||||
{"content", content},
|
||||
{"role", "assistant"}
|
||||
{"role", "assistant"}
|
||||
}
|
||||
}}});
|
||||
|
||||
@@ -587,7 +611,9 @@ struct server_task_result_cmpl_final : server_task_result {
|
||||
|
||||
struct server_task_result_cmpl_partial : server_task_result {
|
||||
int index = 0;
|
||||
std::string content;
|
||||
|
||||
std::string content;
|
||||
llama_tokens tokens;
|
||||
|
||||
int32_t n_decoded;
|
||||
int32_t n_prompt_tokens;
|
||||
@@ -619,6 +645,7 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
json res = json {
|
||||
{"index", index},
|
||||
{"content", content},
|
||||
{"tokens", tokens},
|
||||
{"stop", false},
|
||||
{"id_slot", id_slot},
|
||||
{"tokens_predicted", n_decoded},
|
||||
@@ -660,7 +687,7 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
json second_ret = json{
|
||||
{"choices", json::array({json{{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{
|
||||
{"delta", json {
|
||||
{"content", content}}}
|
||||
}})},
|
||||
{"created", t},
|
||||
@@ -675,7 +702,7 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta",
|
||||
json{
|
||||
json {
|
||||
{"content", content},
|
||||
}},
|
||||
}});
|
||||
@@ -701,14 +728,17 @@ struct server_task_result_embd : server_task_result {
|
||||
int index = 0;
|
||||
std::vector<float> embedding;
|
||||
|
||||
int32_t n_tokens;
|
||||
|
||||
virtual int get_index() override {
|
||||
return index;
|
||||
}
|
||||
|
||||
virtual json to_json() override {
|
||||
return json {
|
||||
{"index", index},
|
||||
{"embedding", embedding},
|
||||
{"index", index},
|
||||
{"embedding", embedding},
|
||||
{"tokens_evaluated", n_tokens},
|
||||
};
|
||||
}
|
||||
};
|
||||
@@ -717,14 +747,17 @@ struct server_task_result_rerank : server_task_result {
|
||||
int index = 0;
|
||||
float score = -1e6;
|
||||
|
||||
int32_t n_tokens;
|
||||
|
||||
virtual int get_index() override {
|
||||
return index;
|
||||
}
|
||||
|
||||
virtual json to_json() override {
|
||||
return json {
|
||||
{"index", index},
|
||||
{"score", score},
|
||||
{"index", index},
|
||||
{"score", score},
|
||||
{"tokens_evaluated", n_tokens},
|
||||
};
|
||||
}
|
||||
};
|
||||
@@ -931,8 +964,11 @@ struct server_slot {
|
||||
|
||||
size_t last_nl_pos = 0;
|
||||
|
||||
std::string generated_text;
|
||||
std::string generated_text;
|
||||
llama_tokens generated_tokens;
|
||||
|
||||
llama_tokens cache_tokens;
|
||||
|
||||
std::vector<completion_token_output> generated_token_probs;
|
||||
|
||||
bool has_next_token = true;
|
||||
@@ -976,6 +1012,7 @@ struct server_slot {
|
||||
n_sent_token_probs = 0;
|
||||
task_type = SERVER_TASK_TYPE_COMPLETION;
|
||||
|
||||
generated_tokens.clear();
|
||||
generated_token_probs.clear();
|
||||
}
|
||||
|
||||
@@ -1469,7 +1506,7 @@ struct server_context {
|
||||
n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
add_bos_token = llama_add_bos_token(model);
|
||||
has_eos_token = !llama_add_eos_token(model);
|
||||
has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL;
|
||||
|
||||
if (!params_base.speculative.model.empty()) {
|
||||
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
|
||||
@@ -1716,8 +1753,10 @@ struct server_context {
|
||||
const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
|
||||
slot.sampled = result.tok;
|
||||
|
||||
// search stop word and delete it
|
||||
slot.generated_text += token_str;
|
||||
if (slot.params.return_tokens) {
|
||||
slot.generated_tokens.push_back(result.tok);
|
||||
}
|
||||
slot.has_next_token = true;
|
||||
|
||||
// check if there is incomplete UTF-8 character at the end
|
||||
@@ -1742,6 +1781,7 @@ struct server_context {
|
||||
break;
|
||||
}
|
||||
|
||||
// search stop word and delete it
|
||||
if (!incomplete) {
|
||||
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
|
||||
|
||||
@@ -1894,6 +1934,7 @@ struct server_context {
|
||||
res->id = slot.id_task;
|
||||
res->index = slot.index;
|
||||
res->content = tkn.text_to_send;
|
||||
res->tokens = { tkn.tok };
|
||||
|
||||
res->n_decoded = slot.n_decoded;
|
||||
res->n_prompt_tokens = slot.n_prompt_tokens;
|
||||
@@ -1934,6 +1975,7 @@ struct server_context {
|
||||
|
||||
res->index = slot.index;
|
||||
res->content = slot.generated_text;
|
||||
res->tokens = slot.generated_tokens;
|
||||
res->timings = slot.get_timings();
|
||||
res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
|
||||
|
||||
@@ -1977,6 +2019,7 @@ struct server_context {
|
||||
auto res = std::make_unique<server_task_result_embd>();
|
||||
res->id = slot.id_task;
|
||||
res->index = slot.index;
|
||||
res->n_tokens = slot.n_prompt_tokens;
|
||||
|
||||
const int n_embd = llama_n_embd(model);
|
||||
|
||||
@@ -2012,6 +2055,7 @@ struct server_context {
|
||||
auto res = std::make_unique<server_task_result_rerank>();
|
||||
res->id = slot.id_task;
|
||||
res->index = slot.index;
|
||||
res->n_tokens = slot.n_prompt_tokens;
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||
@@ -3381,7 +3425,7 @@ int main(int argc, char ** argv) {
|
||||
task.index = i;
|
||||
|
||||
task.prompt_tokens = std::move(tokenized_prompts[i]);
|
||||
task.params = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.params_base, data);
|
||||
task.params = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.ctx, ctx_server.params_base, data);
|
||||
task.id_selected_slot = json_value(data, "id_slot", -1);
|
||||
|
||||
// OAI-compat
|
||||
@@ -3625,25 +3669,33 @@ int main(int argc, char ** argv) {
|
||||
const json body = json::parse(req.body);
|
||||
bool oaicompat = false;
|
||||
|
||||
// an input prompt can be a string or a list of tokens (integer)
|
||||
// for the shape of input/content, see tokenize_input_prompts()
|
||||
json prompt;
|
||||
if (body.count("input") != 0) {
|
||||
if (body.contains("input")) {
|
||||
oaicompat = true;
|
||||
prompt = body.at("input");
|
||||
} else if (body.count("content") != 0) {
|
||||
// with "content", we only support single prompt
|
||||
prompt = std::vector<std::string>{body.at("content")};
|
||||
} else if (body.contains("content")) {
|
||||
oaicompat = false;
|
||||
prompt = body.at("content");
|
||||
} else {
|
||||
res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
|
||||
for (const auto & tokens : tokenized_prompts) {
|
||||
// this check is necessary for models that do not add BOS token to the input
|
||||
if (tokens.empty()) {
|
||||
res_error(res, format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// create and queue the task
|
||||
json responses = json::array();
|
||||
bool error = false;
|
||||
{
|
||||
std::vector<server_task> tasks;
|
||||
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, /* add_special */ false, true);
|
||||
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
|
||||
server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
@@ -3828,8 +3880,13 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
} else {
|
||||
// using embedded static index.html
|
||||
svr->Get("/", [](const httplib::Request &, httplib::Response & res) {
|
||||
res.set_content(reinterpret_cast<const char*>(index_html), index_html_len, "text/html; charset=utf-8");
|
||||
svr->Get("/", [](const httplib::Request & req, httplib::Response & res) {
|
||||
if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
|
||||
res.set_content("Error: gzip is not supported by this browser", "text/plain");
|
||||
} else {
|
||||
res.set_header("Content-Encoding", "gzip");
|
||||
res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
|
||||
}
|
||||
return false;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -10,16 +10,17 @@ def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
|
||||
("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
|
||||
("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
|
||||
@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
|
||||
("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False),
|
||||
("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
|
||||
])
|
||||
def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
|
||||
def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"n_predict": n_predict,
|
||||
"prompt": prompt,
|
||||
"return_tokens": return_tokens,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body["timings"]["prompt_n"] == n_prompt
|
||||
@@ -27,6 +28,11 @@ def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int,
|
||||
assert res.body["truncated"] == truncated
|
||||
assert type(res.body["has_new_line"]) == bool
|
||||
assert match_regex(re_content, res.body["content"])
|
||||
if return_tokens:
|
||||
assert len(res.body["tokens"]) > 0
|
||||
assert all(type(tok) == int for tok in res.body["tokens"])
|
||||
else:
|
||||
assert res.body["tokens"] == []
|
||||
|
||||
|
||||
@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
|
||||
@@ -56,6 +62,8 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
|
||||
assert data["generation_settings"]["seed"] == server.seed
|
||||
assert match_regex(re_content, content)
|
||||
else:
|
||||
assert len(data["tokens"]) > 0
|
||||
assert all(type(tok) == int for tok in data["tokens"])
|
||||
content += data["content"]
|
||||
|
||||
|
||||
|
||||
@@ -45,6 +45,35 @@ def test_embedding_multiple():
|
||||
assert len(d['embedding']) > 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"content,is_multi_prompt",
|
||||
[
|
||||
# single prompt
|
||||
("string", False),
|
||||
([12, 34, 56], False),
|
||||
([12, 34, "string", 56, 78], False),
|
||||
# multiple prompts
|
||||
(["string1", "string2"], True),
|
||||
(["string1", [12, 34, 56]], True),
|
||||
([[12, 34, 56], [12, 34, 56]], True),
|
||||
([[12, 34, 56], [12, "string", 34, 56]], True),
|
||||
]
|
||||
)
|
||||
def test_embedding_mixed_input(content, is_multi_prompt: bool):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/embeddings", data={"content": content})
|
||||
assert res.status_code == 200
|
||||
if is_multi_prompt:
|
||||
assert len(res.body) == len(content)
|
||||
for d in res.body:
|
||||
assert 'embedding' in d
|
||||
assert len(d['embedding']) > 1
|
||||
else:
|
||||
assert 'embedding' in res.body
|
||||
assert len(res.body['embedding']) > 1
|
||||
|
||||
|
||||
def test_embedding_openai_library_single():
|
||||
global server
|
||||
server.start()
|
||||
@@ -97,3 +126,33 @@ def test_same_prompt_give_same_result():
|
||||
vi = res.body['data'][i]['embedding']
|
||||
for x, y in zip(v0, vi):
|
||||
assert abs(x - y) < EPSILON
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"content,n_tokens",
|
||||
[
|
||||
("I believe the meaning of life is", 9),
|
||||
("This is a test", 6),
|
||||
]
|
||||
)
|
||||
def test_embedding_usage_single(content, n_tokens):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/embeddings", data={"input": content})
|
||||
assert res.status_code == 200
|
||||
assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
|
||||
assert res.body['usage']['prompt_tokens'] == n_tokens
|
||||
|
||||
|
||||
def test_embedding_usage_multiple():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/embeddings", data={
|
||||
"input": [
|
||||
"I believe the meaning of life is",
|
||||
"I believe the meaning of life is",
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
|
||||
assert res.body['usage']['prompt_tokens'] == 2 * 9
|
||||
|
||||
@@ -53,3 +53,26 @@ def test_invalid_rerank_req(documents):
|
||||
})
|
||||
assert res.status_code == 400
|
||||
assert "error" in res.body
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"query,doc1,doc2,n_tokens",
|
||||
[
|
||||
("Machine learning is", "A machine", "Learning is", 19),
|
||||
("Which city?", "Machine learning is ", "Paris, capitale de la", 26),
|
||||
]
|
||||
)
|
||||
def test_rerank_usage(query, doc1, doc2, n_tokens):
|
||||
global server
|
||||
server.start()
|
||||
|
||||
res = server.make_request("POST", "/rerank", data={
|
||||
"query": query,
|
||||
"documents": [
|
||||
doc1,
|
||||
doc2,
|
||||
]
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
|
||||
assert res.body['usage']['prompt_tokens'] == n_tokens
|
||||
|
||||
@@ -222,7 +222,6 @@
|
||||
temperature: 0.7,
|
||||
repeat_last_n: 256, // 0 = disable penalty, -1 = context size
|
||||
repeat_penalty: 1.18, // 1.0 = disabled
|
||||
penalize_nl: false,
|
||||
top_k: 40, // <= 0 to use vocab size
|
||||
top_p: 0.95, // 1.0 = disabled
|
||||
min_p: 0.05, // 0 = disabled
|
||||
@@ -779,7 +778,6 @@
|
||||
${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
|
||||
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
|
||||
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
|
||||
${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
|
||||
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
|
||||
${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
|
||||
${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
|
||||
|
||||
@@ -225,7 +225,6 @@
|
||||
temperature: 0.7,
|
||||
repeat_last_n: 256, // 0 = disable penalty, -1 = context size
|
||||
repeat_penalty: 1.18, // 1.0 = disabled
|
||||
penalize_nl: false,
|
||||
top_k: 40, // <= 0 to use vocab size
|
||||
top_p: 0.95, // 1.0 = disabled
|
||||
min_p: 0.05, // 0 = disabled
|
||||
@@ -782,7 +781,6 @@
|
||||
${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
|
||||
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
|
||||
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
|
||||
${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
|
||||
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
|
||||
${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
|
||||
${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
|
||||
|
||||
@@ -138,6 +138,7 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_
|
||||
* and multiple prompts (multi-tasks):
|
||||
* - "prompt": ["string1", "string2"]
|
||||
* - "prompt": ["string1", [12, 34, 56]]
|
||||
* - "prompt": [[12, 34, 56], [78, 90, 12]]
|
||||
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
|
||||
*/
|
||||
static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
|
||||
@@ -560,6 +561,7 @@ static json oaicompat_completion_params_parse(
|
||||
|
||||
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
|
||||
json data = json::array();
|
||||
int32_t n_tokens = 0;
|
||||
int i = 0;
|
||||
for (const auto & elem : embeddings) {
|
||||
data.push_back(json{
|
||||
@@ -567,14 +569,16 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
|
||||
{"index", i++},
|
||||
{"object", "embedding"}
|
||||
});
|
||||
|
||||
n_tokens += json_value(elem, "tokens_evaluated", 0);
|
||||
}
|
||||
|
||||
json res = json {
|
||||
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
|
||||
{"object", "list"},
|
||||
{"usage", json { // TODO: fill
|
||||
{"prompt_tokens", 0},
|
||||
{"total_tokens", 0}
|
||||
{"usage", json {
|
||||
{"prompt_tokens", n_tokens},
|
||||
{"total_tokens", n_tokens}
|
||||
}},
|
||||
{"data", data}
|
||||
};
|
||||
@@ -584,20 +588,23 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
|
||||
|
||||
static json format_response_rerank(const json & request, const json & ranks) {
|
||||
json data = json::array();
|
||||
int32_t n_tokens = 0;
|
||||
int i = 0;
|
||||
for (const auto & rank : ranks) {
|
||||
data.push_back(json{
|
||||
{"index", i++},
|
||||
{"relevance_score", json_value(rank, "score", 0.0)},
|
||||
});
|
||||
|
||||
n_tokens += json_value(rank, "tokens_evaluated", 0);
|
||||
}
|
||||
|
||||
json res = json {
|
||||
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
|
||||
{"object", "list"},
|
||||
{"usage", json { // TODO: fill
|
||||
{"prompt_tokens", 0},
|
||||
{"total_tokens", 0}
|
||||
{"usage", json {
|
||||
{"prompt_tokens", n_tokens},
|
||||
{"total_tokens", n_tokens}
|
||||
}},
|
||||
{"results", data}
|
||||
};
|
||||
|
||||
@@ -201,6 +201,10 @@
|
||||
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
|
||||
<summary class="collapse-title font-bold">Advanced config</summary>
|
||||
<div class="collapse-content">
|
||||
<div class="flex flex-row items-center mb-2" v-if="isDev">
|
||||
<!-- this button only shows in dev mode, used to import a demo conversation to test message rendering -->
|
||||
<button class="btn" @click="debugImportDemoConv()">(debug) Import demo conversation</button>
|
||||
</div>
|
||||
<div class="flex flex-row items-center mb-2">
|
||||
<input type="checkbox" class="checkbox" v-model="config.showTokensPerSecond" />
|
||||
<span class="ml-4">Show tokens per second</span>
|
||||
|
||||
Generated
+519
@@ -8,8 +8,12 @@
|
||||
"name": "webui",
|
||||
"version": "0.0.0",
|
||||
"dependencies": {
|
||||
"@sec-ant/readable-stream": "^0.6.0",
|
||||
"@vscode/markdown-it-katex": "^1.1.1",
|
||||
"autoprefixer": "^10.4.20",
|
||||
"daisyui": "^4.12.14",
|
||||
"highlight.js": "^11.10.0",
|
||||
"katex": "^0.16.15",
|
||||
"markdown-it": "^14.1.0",
|
||||
"postcss": "^8.4.49",
|
||||
"tailwindcss": "^3.4.15",
|
||||
@@ -18,6 +22,7 @@
|
||||
"vue": "^3.5.13"
|
||||
},
|
||||
"devDependencies": {
|
||||
"sass-embedded": "^1.83.0",
|
||||
"vite": "^5.4.10"
|
||||
}
|
||||
},
|
||||
@@ -33,6 +38,13 @@
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/@bufbuild/protobuf": {
|
||||
"version": "2.2.3",
|
||||
"resolved": "https://registry.npmjs.org/@bufbuild/protobuf/-/protobuf-2.2.3.tgz",
|
||||
"integrity": "sha512-tFQoXHJdkEOSwj5tRIZSPNUuXK3RaR7T1nUrPgbYX1pUbvqqaaZAsfo+NXBPsz5rZMSKVFrgK1WL8Q/MSLvprg==",
|
||||
"devOptional": true,
|
||||
"license": "(Apache-2.0 AND BSD-3-Clause)"
|
||||
},
|
||||
"node_modules/@esbuild/aix-ppc64": {
|
||||
"version": "0.21.5",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz",
|
||||
@@ -606,6 +618,21 @@
|
||||
"win32"
|
||||
]
|
||||
},
|
||||
"node_modules/@sec-ant/readable-stream": {
|
||||
"version": "0.6.0",
|
||||
"resolved": "https://registry.npmjs.org/@sec-ant/readable-stream/-/readable-stream-0.6.0.tgz",
|
||||
"integrity": "sha512-uiBh8DrB5FN35gP6/o8JEhEQ7/ci1jUsOZO/VMUjyvTpjtV54VstOXVj1TvTj/wsT23pfX6butxxh3qufsW3+g==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@vscode/markdown-it-katex": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/@vscode/markdown-it-katex/-/markdown-it-katex-1.1.1.tgz",
|
||||
"integrity": "sha512-3KTlbsRBPJQLE2YmLL7K6nunTlU+W9T5+FjfNdWuIUKgxSS6HWLQHaO3L4MkJi7z7MpIPpY+g4N+cWNBPE/MSA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"katex": "^0.16.4"
|
||||
}
|
||||
},
|
||||
"node_modules/@vue/compiler-dom": {
|
||||
"version": "3.5.13",
|
||||
"resolved": "https://registry.npmjs.org/@vue/compiler-dom/-/compiler-dom-3.5.13.tgz",
|
||||
@@ -1004,6 +1031,13 @@
|
||||
"browserslist": ">= 4.21.0"
|
||||
}
|
||||
},
|
||||
"node_modules/buffer-builder": {
|
||||
"version": "0.2.0",
|
||||
"resolved": "https://registry.npmjs.org/buffer-builder/-/buffer-builder-0.2.0.tgz",
|
||||
"integrity": "sha512-7VPMEPuYznPSoR21NE1zvd2Xna6c/CloiZCfcMXR1Jny6PjX0N4Nsa38zcBFo/FMK+BlA+FLKbJCQ0i2yxp+Xg==",
|
||||
"devOptional": true,
|
||||
"license": "MIT/X11"
|
||||
},
|
||||
"node_modules/caniuse-lite": {
|
||||
"version": "1.0.30001684",
|
||||
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001684.tgz",
|
||||
@@ -1166,6 +1200,22 @@
|
||||
"node": ">=8.0"
|
||||
}
|
||||
},
|
||||
"node_modules/colorjs.io": {
|
||||
"version": "0.5.2",
|
||||
"resolved": "https://registry.npmjs.org/colorjs.io/-/colorjs.io-0.5.2.tgz",
|
||||
"integrity": "sha512-twmVoizEW7ylZSN32OgKdXRmo1qg+wT5/6C3xu5b9QsWzSFAhHLn2xd8ro0diCsKfCj1RdaTP/nrcW+vAoQPIw==",
|
||||
"devOptional": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/commander": {
|
||||
"version": "8.3.0",
|
||||
"resolved": "https://registry.npmjs.org/commander/-/commander-8.3.0.tgz",
|
||||
"integrity": "sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 12"
|
||||
}
|
||||
},
|
||||
"node_modules/css-selector-tokenizer": {
|
||||
"version": "0.8.0",
|
||||
"resolved": "https://registry.npmjs.org/css-selector-tokenizer/-/css-selector-tokenizer-0.8.0.tgz",
|
||||
@@ -1473,6 +1523,31 @@
|
||||
"node": ">=10.13.0"
|
||||
}
|
||||
},
|
||||
"node_modules/has-flag": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
|
||||
"integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
|
||||
"devOptional": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/highlight.js": {
|
||||
"version": "11.10.0",
|
||||
"resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.10.0.tgz",
|
||||
"integrity": "sha512-SYVnVFswQER+zu1laSya563s+F8VDGt7o35d4utbamowvUNLLMovFqwCLSocpZTz3MgaSRA1IbqRWZv97dtErQ==",
|
||||
"engines": {
|
||||
"node": ">=12.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/immutable": {
|
||||
"version": "5.0.3",
|
||||
"resolved": "https://registry.npmjs.org/immutable/-/immutable-5.0.3.tgz",
|
||||
"integrity": "sha512-P8IdPQHq3lA1xVeBRi5VPqUm5HDgKnx0Ru51wZz5mjxHr5n3RWhjIpOFU7ybkUxfB+5IToy+OLaHYDBIWsv+uw==",
|
||||
"devOptional": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/is-glob": {
|
||||
"version": "4.0.3",
|
||||
"resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz",
|
||||
@@ -1503,6 +1578,22 @@
|
||||
"jiti": "bin/jiti.js"
|
||||
}
|
||||
},
|
||||
"node_modules/katex": {
|
||||
"version": "0.16.15",
|
||||
"resolved": "https://registry.npmjs.org/katex/-/katex-0.16.15.tgz",
|
||||
"integrity": "sha512-yE9YJIEAk2aZ+FL/G8r+UGw0CTUzEA8ZFy6E+8tc3spHUKq3qBnzCkI1CQwGoI9atJhVyFPEypQsTY7mJ1Pi9w==",
|
||||
"funding": [
|
||||
"https://opencollective.com/katex",
|
||||
"https://github.com/sponsors/katex"
|
||||
],
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"commander": "^8.3.0"
|
||||
},
|
||||
"bin": {
|
||||
"katex": "cli.js"
|
||||
}
|
||||
},
|
||||
"node_modules/lilconfig": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz",
|
||||
@@ -2022,6 +2113,381 @@
|
||||
"integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/rxjs": {
|
||||
"version": "7.8.1",
|
||||
"resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.1.tgz",
|
||||
"integrity": "sha512-AA3TVj+0A2iuIoQkWEK/tqFjBq2j+6PO6Y0zJcvzLAFhEFIO3HL0vls9hWLncZbAAbK0mar7oZ4V079I/qPMxg==",
|
||||
"devOptional": true,
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"tslib": "^2.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded/-/sass-embedded-1.83.0.tgz",
|
||||
"integrity": "sha512-/8cYZeL39evUqe0o//193na51Q1VWZ61qhxioQvLJwOtWIrX+PgNhCyD8RSuTtmzc4+6+waFZf899bfp/MCUwA==",
|
||||
"devOptional": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@bufbuild/protobuf": "^2.0.0",
|
||||
"buffer-builder": "^0.2.0",
|
||||
"colorjs.io": "^0.5.0",
|
||||
"immutable": "^5.0.2",
|
||||
"rxjs": "^7.4.0",
|
||||
"supports-color": "^8.1.1",
|
||||
"sync-child-process": "^1.0.2",
|
||||
"varint": "^6.0.0"
|
||||
},
|
||||
"bin": {
|
||||
"sass": "dist/bin/sass.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=16.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"sass-embedded-android-arm": "1.83.0",
|
||||
"sass-embedded-android-arm64": "1.83.0",
|
||||
"sass-embedded-android-ia32": "1.83.0",
|
||||
"sass-embedded-android-riscv64": "1.83.0",
|
||||
"sass-embedded-android-x64": "1.83.0",
|
||||
"sass-embedded-darwin-arm64": "1.83.0",
|
||||
"sass-embedded-darwin-x64": "1.83.0",
|
||||
"sass-embedded-linux-arm": "1.83.0",
|
||||
"sass-embedded-linux-arm64": "1.83.0",
|
||||
"sass-embedded-linux-ia32": "1.83.0",
|
||||
"sass-embedded-linux-musl-arm": "1.83.0",
|
||||
"sass-embedded-linux-musl-arm64": "1.83.0",
|
||||
"sass-embedded-linux-musl-ia32": "1.83.0",
|
||||
"sass-embedded-linux-musl-riscv64": "1.83.0",
|
||||
"sass-embedded-linux-musl-x64": "1.83.0",
|
||||
"sass-embedded-linux-riscv64": "1.83.0",
|
||||
"sass-embedded-linux-x64": "1.83.0",
|
||||
"sass-embedded-win32-arm64": "1.83.0",
|
||||
"sass-embedded-win32-ia32": "1.83.0",
|
||||
"sass-embedded-win32-x64": "1.83.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-android-arm": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-android-arm/-/sass-embedded-android-arm-1.83.0.tgz",
|
||||
"integrity": "sha512-uwFSXzJlfbd4Px189xE5l+cxN8+TQpXdQgJec7TIrb4HEY7imabtpYufpVdqUVwT1/uiis5V4+qIEC4Vl5XObQ==",
|
||||
"cpu": [
|
||||
"arm"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"android"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-android-arm64": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-android-arm64/-/sass-embedded-android-arm64-1.83.0.tgz",
|
||||
"integrity": "sha512-GBiCvM4a2rkWBLdYDxI6XYnprfk5U5c81g69RC2X6kqPuzxzx8qTArQ9M6keFK4+iDQ5N9QTwFCr0KbZTn+ZNQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"android"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-android-ia32": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-android-ia32/-/sass-embedded-android-ia32-1.83.0.tgz",
|
||||
"integrity": "sha512-5ATPdGo2SICqAhiJl/Z8KQ23zH4sGgobGgux0TnrNtt83uHZ+r+To/ubVJ7xTkZxed+KJZnIpolGD8dQyQqoTg==",
|
||||
"cpu": [
|
||||
"ia32"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"android"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-android-riscv64": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-android-riscv64/-/sass-embedded-android-riscv64-1.83.0.tgz",
|
||||
"integrity": "sha512-aveknUOB8GZewOzVn2Uwk+DKcncTR50Q6vtzslNMGbYnxtgQNHzy8A1qVEviNUruex+pHofppeMK4iMPFAbiEQ==",
|
||||
"cpu": [
|
||||
"riscv64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"android"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-android-x64": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-android-x64/-/sass-embedded-android-x64-1.83.0.tgz",
|
||||
"integrity": "sha512-WqIay/72ncyf9Ph4vS742J3a73wZihWmzFUwpn1OD6lme1Aj4eWzWIve5IVnlTEJgcZcDHu6ECID9IZgehJKoA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"android"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-darwin-arm64": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-darwin-arm64/-/sass-embedded-darwin-arm64-1.83.0.tgz",
|
||||
"integrity": "sha512-XQl9QqgxFFIPm/CzHhmppse5o9ocxrbaAdC2/DAnlAqvYWBBtgFqPjGoYlej13h9SzfvNoogx+y9r+Ap+e+hYg==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-darwin-x64": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-darwin-x64/-/sass-embedded-darwin-x64-1.83.0.tgz",
|
||||
"integrity": "sha512-ERQ7Tvp1kFOW3ux4VDFIxb7tkYXHYc+zJpcrbs0hzcIO5ilIRU2tIOK1OrNwrFO6Qxyf7AUuBwYKLAtIU/Nz7g==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-linux-arm": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-linux-arm/-/sass-embedded-linux-arm-1.83.0.tgz",
|
||||
"integrity": "sha512-baG9RYBJxUFmqwDNC9h9ZFElgJoyO3jgHGjzEZ1wHhIS9anpG+zZQvO8bHx3dBpKEImX+DBeLX+CxsFR9n81gQ==",
|
||||
"cpu": [
|
||||
"arm"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-linux-arm64": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-linux-arm64/-/sass-embedded-linux-arm64-1.83.0.tgz",
|
||||
"integrity": "sha512-syEAVTJt4qhaMLxrSwOWa46zdqHJdnqJkLUK+t9aCr8xqBZLPxSUeIGji76uOehQZ1C+KGFj6n9xstHN6wzOJw==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-linux-ia32": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-linux-ia32/-/sass-embedded-linux-ia32-1.83.0.tgz",
|
||||
"integrity": "sha512-RRBxQxMpoxu5+XcSSc6QR/o9asEwUzR8AbCS83RaXcdTIHTa/CccQsiAoDDoPlRsMTLqnzs0LKL4CfOsf7zBbA==",
|
||||
"cpu": [
|
||||
"ia32"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-linux-musl-arm": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm/-/sass-embedded-linux-musl-arm-1.83.0.tgz",
|
||||
"integrity": "sha512-Yc7u2TelCfBab+PRob9/MNJFh3EooMiz4urvhejXkihTiKSHGCv5YqDdtWzvyb9tY2Jb7YtYREVuHwfdVn3dTQ==",
|
||||
"cpu": [
|
||||
"arm"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-linux-musl-arm64": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm64/-/sass-embedded-linux-musl-arm64-1.83.0.tgz",
|
||||
"integrity": "sha512-Y7juhPHClUO2H5O+u+StRy6SEAcwZ+hTEk5WJdEmo1Bb1gDtfHvJaWB/iFZJ2tW0W1e865AZeUrC4OcOFjyAQA==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-linux-musl-ia32": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-ia32/-/sass-embedded-linux-musl-ia32-1.83.0.tgz",
|
||||
"integrity": "sha512-arQeYwGmwXV8byx5G1PtSzZWW1jbkfR5qrIHMEbTFSAvAxpqjgSvCvrHMOFd73FcMxVaYh4BX9LQNbKinkbEdg==",
|
||||
"cpu": [
|
||||
"ia32"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-linux-musl-riscv64": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-riscv64/-/sass-embedded-linux-musl-riscv64-1.83.0.tgz",
|
||||
"integrity": "sha512-E6uzlIWz59rut+Z3XR6mLG915zNzv07ISvj3GUNZENdHM7dF8GQ//ANoIpl5PljMQKp89GnYdvo6kj2gnaBf/g==",
|
||||
"cpu": [
|
||||
"riscv64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-linux-musl-x64": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-x64/-/sass-embedded-linux-musl-x64-1.83.0.tgz",
|
||||
"integrity": "sha512-eAMK6tyGqvqr21r9g8BnR3fQc1rYFj85RGduSQ3xkITZ6jOAnOhuU94N5fwRS852Hpws0lXhET+7JHXgg3U18w==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-linux-riscv64": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-linux-riscv64/-/sass-embedded-linux-riscv64-1.83.0.tgz",
|
||||
"integrity": "sha512-Ojpi78pTv02sy2fUYirRGXHLY3fPnV/bvwuC2i5LwPQw2LpCcFyFTtN0c5h4LJDk9P6wr+/ZB/JXU8tHIOlK+Q==",
|
||||
"cpu": [
|
||||
"riscv64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-linux-x64": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-linux-x64/-/sass-embedded-linux-x64-1.83.0.tgz",
|
||||
"integrity": "sha512-3iLjlXdoPfgZRtX4odhRvka1BQs5mAXqfCtDIQBgh/o0JnGPzJIWWl9bYLpHxK8qb+uyVBxXYgXpI0sCzArBOw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-win32-arm64": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-win32-arm64/-/sass-embedded-win32-arm64-1.83.0.tgz",
|
||||
"integrity": "sha512-iOHw/8/t2dlTW3lOFwG5eUbiwhEyGWawivlKWJ8lkXH7fjMpVx2VO9zCFAm8RvY9xOHJ9sf1L7g5bx3EnNP9BQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-win32-ia32": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-win32-ia32/-/sass-embedded-win32-ia32-1.83.0.tgz",
|
||||
"integrity": "sha512-2PxNXJ8Pad4geVcTXY4rkyTr5AwbF8nfrCTDv0ulbTvPhzX2mMKEGcBZUXWn5BeHZTBc6whNMfS7d5fQXR9dDQ==",
|
||||
"cpu": [
|
||||
"ia32"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sass-embedded-win32-x64": {
|
||||
"version": "1.83.0",
|
||||
"resolved": "https://registry.npmjs.org/sass-embedded-win32-x64/-/sass-embedded-win32-x64-1.83.0.tgz",
|
||||
"integrity": "sha512-muBXkFngM6eLTNqOV0FQi7Dv9s+YRQ42Yem26mosdan/GmJQc81deto6uDTgrYn+bzFNmiXcOdfm+0MkTWK3OQ==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sucrase": {
|
||||
"version": "3.35.0",
|
||||
"resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.0.tgz",
|
||||
@@ -2641,6 +3107,45 @@
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/supports-color": {
|
||||
"version": "8.1.1",
|
||||
"resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz",
|
||||
"integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==",
|
||||
"devOptional": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"has-flag": "^4.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/chalk/supports-color?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/sync-child-process": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/sync-child-process/-/sync-child-process-1.0.2.tgz",
|
||||
"integrity": "sha512-8lD+t2KrrScJ/7KXCSyfhT3/hRq78rC0wBFqNJXv3mZyn6hW2ypM05JmlSvtqRbeq6jqA94oHbxAr2vYsJ8vDA==",
|
||||
"devOptional": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"sync-message-port": "^1.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=16.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/sync-message-port": {
|
||||
"version": "1.1.3",
|
||||
"resolved": "https://registry.npmjs.org/sync-message-port/-/sync-message-port-1.1.3.tgz",
|
||||
"integrity": "sha512-GTt8rSKje5FilG+wEdfCkOcLL7LWqpMlr2c3LRuKt/YXxcJ52aGSbGBAdI4L3aaqfrBt6y711El53ItyH1NWzg==",
|
||||
"devOptional": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=16.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/tailwindcss": {
|
||||
"version": "3.4.15",
|
||||
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.15.tgz",
|
||||
@@ -2684,12 +3189,26 @@
|
||||
"integrity": "sha512-iBHbi7BQxrFmwZUQJsT0SjNzlLLsXhvW/kg7EyOMVMBIrlnj/qYofwo1LVLZi+3GbUEo96Iu2eqToI2+lZoAEQ==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/tslib": {
|
||||
"version": "2.8.1",
|
||||
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
|
||||
"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
|
||||
"devOptional": true,
|
||||
"license": "0BSD"
|
||||
},
|
||||
"node_modules/uc.micro": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz",
|
||||
"integrity": "sha512-ARDJmphmdvUk6Glw7y9DQ2bFkKBHwQHLi2lsaH6PPmz/Ka9sFOBsBluozhDltWmnv9u/cF6Rt87znRTPV+yp/A==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/varint": {
|
||||
"version": "6.0.0",
|
||||
"resolved": "https://registry.npmjs.org/varint/-/varint-6.0.0.tgz",
|
||||
"integrity": "sha512-cXEIW6cfr15lFv563k4GuVuW/fiwjknytD37jIOLSdSWuOI6WnO/oKwmP2FQTU2l01LP8/M5TSAJpzUaGe3uWg==",
|
||||
"devOptional": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/vite": {
|
||||
"version": "5.4.11",
|
||||
"resolved": "https://registry.npmjs.org/vite/-/vite-5.4.11.tgz",
|
||||
|
||||
@@ -6,14 +6,20 @@
|
||||
"scripts": {
|
||||
"dev": "vite",
|
||||
"build": "vite build",
|
||||
"preview": "vite preview"
|
||||
"preview": "vite preview",
|
||||
"analyze": "ANALYZE=1 npx vite-bundle-visualizer"
|
||||
},
|
||||
"devDependencies": {
|
||||
"sass-embedded": "^1.83.0",
|
||||
"vite": "^5.4.10"
|
||||
},
|
||||
"dependencies": {
|
||||
"@sec-ant/readable-stream": "^0.6.0",
|
||||
"@vscode/markdown-it-katex": "^1.1.1",
|
||||
"autoprefixer": "^10.4.20",
|
||||
"daisyui": "^4.12.14",
|
||||
"highlight.js": "^11.10.0",
|
||||
"katex": "^0.16.15",
|
||||
"markdown-it": "^14.1.0",
|
||||
"postcss": "^8.4.49",
|
||||
"tailwindcss": "^3.4.15",
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"demo": true,
|
||||
"id": "conv-1734086746930",
|
||||
"lastModified": 1734087548943,
|
||||
"messages": [
|
||||
{
|
||||
"id": 1734086764521,
|
||||
"role": "user",
|
||||
"content": "this is a demo conversation, used in dev mode"
|
||||
},
|
||||
{
|
||||
"id": 1734087548327,
|
||||
"role": "assistant",
|
||||
"content": "This is the formula:\n\n$\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}$\n\nGiven an input vector \\(\\mathbf{x} = [x_1, x_2, \\ldots, x_n]\\)\n\n\\[\ny_i = \\frac{e^{x_i}}{\\sum_{j=1}^n e^{x_j}}\n\\]\n\nCode block latex:\n```latex\n\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}\n```\n\nTest dollar sign: $1234 $4567\n\nInvalid latex syntax: $E = mc^$ and $$E = mc^$$",
|
||||
"timings": {
|
||||
"prompt_n": 1,
|
||||
"prompt_ms": 28.923,
|
||||
"predicted_n": 25,
|
||||
"predicted_ms": 573.016
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 1734087548328,
|
||||
"role": "user",
|
||||
"content": "this is a demo conversation, used in dev mode"
|
||||
},
|
||||
{
|
||||
"id": 1734087548329,
|
||||
"role": "assistant",
|
||||
"content": "Code block:\n```js\nconsole.log('hello world')\n```\n```sh\nls -la /dev\n```"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,60 @@
|
||||
import hljs from 'highlight.js/lib/core';
|
||||
|
||||
// only import commonly used languages to reduce bundle size
|
||||
|
||||
import python from 'highlight.js/lib/languages/python';
|
||||
import javascript from 'highlight.js/lib/languages/javascript';
|
||||
import json from 'highlight.js/lib/languages/json';
|
||||
import bash from 'highlight.js/lib/languages/bash';
|
||||
import yaml from 'highlight.js/lib/languages/yaml';
|
||||
import markdown from 'highlight.js/lib/languages/markdown';
|
||||
import scss from 'highlight.js/lib/languages/scss';
|
||||
import xml from 'highlight.js/lib/languages/xml';
|
||||
import ruby from 'highlight.js/lib/languages/ruby';
|
||||
import go from 'highlight.js/lib/languages/go';
|
||||
import java from 'highlight.js/lib/languages/java';
|
||||
import rust from 'highlight.js/lib/languages/rust';
|
||||
import scala from 'highlight.js/lib/languages/scala';
|
||||
import cpp from 'highlight.js/lib/languages/cpp';
|
||||
import csharp from 'highlight.js/lib/languages/csharp';
|
||||
import swift from 'highlight.js/lib/languages/swift';
|
||||
import dart from 'highlight.js/lib/languages/dart';
|
||||
import elixir from 'highlight.js/lib/languages/elixir';
|
||||
import kotlin from 'highlight.js/lib/languages/kotlin';
|
||||
import lua from 'highlight.js/lib/languages/lua';
|
||||
import php from 'highlight.js/lib/languages/php';
|
||||
import latex from 'highlight.js/lib/languages/latex';
|
||||
|
||||
hljs.registerLanguage('python', python);
|
||||
hljs.registerLanguage('javascript', javascript);
|
||||
hljs.registerLanguage('json', json);
|
||||
hljs.registerLanguage('yaml', yaml);
|
||||
hljs.registerLanguage('markdown', markdown);
|
||||
hljs.registerLanguage('xml', xml);
|
||||
hljs.registerLanguage('ruby', ruby);
|
||||
hljs.registerLanguage('go', go);
|
||||
hljs.registerLanguage('java', java);
|
||||
hljs.registerLanguage('rust', rust);
|
||||
hljs.registerLanguage('scala', scala);
|
||||
hljs.registerLanguage('csharp', csharp);
|
||||
hljs.registerLanguage('swift', swift);
|
||||
hljs.registerLanguage('dart', dart);
|
||||
hljs.registerLanguage('elixir', elixir);
|
||||
hljs.registerLanguage('kotlin', kotlin);
|
||||
hljs.registerLanguage('lua', lua);
|
||||
hljs.registerLanguage('php', php);
|
||||
hljs.registerLanguage('latex', latex);
|
||||
|
||||
// reuse some languages to further reduce bundle size
|
||||
|
||||
hljs.registerLanguage('shell', bash);
|
||||
hljs.registerLanguage('bash', bash);
|
||||
hljs.registerLanguage('sh', bash);
|
||||
|
||||
hljs.registerLanguage('css', scss);
|
||||
hljs.registerLanguage('scss', scss);
|
||||
|
||||
hljs.registerLanguage('c', cpp);
|
||||
hljs.registerLanguage('cpp', cpp);
|
||||
|
||||
export default hljs;
|
||||
@@ -0,0 +1,66 @@
|
||||
import katex from 'katex';
|
||||
|
||||
// Adapted from https://github.com/SchneeHertz/markdown-it-katex-gpt
|
||||
// MIT license
|
||||
|
||||
const defaultOptions = {
|
||||
delimiters: [
|
||||
{ left: '\\[', right: '\\]', display: true },
|
||||
{ left: '\\(', right: '\\)', display: false },
|
||||
],
|
||||
};
|
||||
|
||||
export function renderLatexHTML(content, display = false) {
|
||||
return katex.renderToString(content, {
|
||||
throwOnError: false,
|
||||
output: 'mathml',
|
||||
displayMode: display,
|
||||
});
|
||||
}
|
||||
|
||||
function escapedBracketRule(options) {
|
||||
return (state, silent) => {
|
||||
const max = state.posMax;
|
||||
const start = state.pos;
|
||||
|
||||
for (const { left, right, display } of options.delimiters) {
|
||||
|
||||
// Check if it starts with the left delimiter
|
||||
if (!state.src.slice(start).startsWith(left)) continue;
|
||||
|
||||
// Skip the length of the left delimiter
|
||||
let pos = start + left.length;
|
||||
|
||||
// Find the matching right delimiter
|
||||
while (pos < max) {
|
||||
if (state.src.slice(pos).startsWith(right)) {
|
||||
break;
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
|
||||
// No matching right delimiter found, skip to the next match
|
||||
if (pos >= max) continue;
|
||||
|
||||
// If not in silent mode, convert LaTeX formula to MathML
|
||||
if (!silent) {
|
||||
const content = state.src.slice(start + left.length, pos);
|
||||
try {
|
||||
const renderedContent = renderLatexHTML(content, display);
|
||||
const token = state.push('html_inline', '', 0);
|
||||
token.content = renderedContent;
|
||||
} catch (e) {
|
||||
console.error(e);
|
||||
}
|
||||
}
|
||||
|
||||
// Update position, skip the length of the right delimiter
|
||||
state.pos = pos + right.length;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export default function (md, options = defaultOptions) {
|
||||
md.inline.ruler.after('text', 'escaped_bracket', escapedBracketRule(options));
|
||||
}
|
||||
@@ -1,8 +1,20 @@
|
||||
import './styles.css';
|
||||
import './styles.scss';
|
||||
import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
|
||||
import MarkdownIt from 'markdown-it';
|
||||
import TextLineStream from 'textlinestream';
|
||||
|
||||
// math formula rendering
|
||||
import 'katex/dist/katex.min.css';
|
||||
import markdownItKatexGpt from './katex-gpt';
|
||||
import markdownItKatexNormal from '@vscode/markdown-it-katex';
|
||||
|
||||
// code highlighting
|
||||
import hljs from './highlight-config';
|
||||
import daisyuiThemes from 'daisyui/src/theming/themes';
|
||||
|
||||
// ponyfill for missing ReadableStream asyncIterator on Safari
|
||||
import { asyncIterator } from "@sec-ant/readable-stream/ponyfill/asyncIterator";
|
||||
|
||||
const isDev = import.meta.env.MODE === 'development';
|
||||
|
||||
// utility functions
|
||||
@@ -13,15 +25,18 @@ const escapeAttr = (str) => str.replace(/>/g, '>').replace(/"/g, '"');
|
||||
const copyStr = (str) => navigator.clipboard.writeText(str);
|
||||
|
||||
// constants
|
||||
const BASE_URL = localStorage.getItem('base') // for debugging
|
||||
|| (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
|
||||
const BASE_URL = isDev
|
||||
? (localStorage.getItem('base') || 'https://localhost:8080') // for debugging
|
||||
: (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
|
||||
console.log({ BASE_URL });
|
||||
|
||||
const CONFIG_DEFAULT = {
|
||||
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
|
||||
apiKey: '',
|
||||
systemMessage: 'You are a helpful assistant.',
|
||||
showTokensPerSecond: false,
|
||||
// make sure these default values are in sync with `common.h`
|
||||
samplers: 'dkypmxt',
|
||||
samplers: 'edkypmxt',
|
||||
temperature: 0.8,
|
||||
dynatemp_range: 0.0,
|
||||
dynatemp_exponent: 1.0,
|
||||
@@ -69,12 +84,39 @@ const CONFIG_INFO = {
|
||||
// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
|
||||
const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
|
||||
// list of themes supported by daisyui
|
||||
const THEMES = ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'];
|
||||
const THEMES = ['light', 'dark']
|
||||
// make sure light & dark are always at the beginning
|
||||
.concat(Object.keys(daisyuiThemes).filter(t => t !== 'light' && t !== 'dark'));
|
||||
|
||||
// markdown support
|
||||
const VueMarkdown = defineComponent(
|
||||
(props) => {
|
||||
const md = shallowRef(new MarkdownIt({ breaks: true }));
|
||||
const md = shallowRef(new MarkdownIt({
|
||||
breaks: true,
|
||||
highlight: function (str, lang) { // Add highlight.js
|
||||
if (lang && hljs.getLanguage(lang)) {
|
||||
try {
|
||||
return '<pre><code class="hljs">' +
|
||||
hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
|
||||
'</code></pre>';
|
||||
} catch (__) {}
|
||||
}
|
||||
return '<pre><code class="hljs">' + md.value.utils.escapeHtml(str) + '</code></pre>';
|
||||
}
|
||||
}));
|
||||
// support latex with double dollar sign and square brackets
|
||||
md.value.use(markdownItKatexGpt, {
|
||||
delimiters: [
|
||||
{ left: '\\[', right: '\\]', display: true },
|
||||
{ left: '\\(', right: '\\)', display: false },
|
||||
{ left: '$$', right: '$$', display: false },
|
||||
// do not add single dollar sign here, other wise it will confused with dollar used for money symbol
|
||||
],
|
||||
throwOnError: false,
|
||||
});
|
||||
// support latex with single dollar sign
|
||||
md.value.use(markdownItKatexNormal, { throwOnError: false });
|
||||
// add copy button to code blocks
|
||||
const origFenchRenderer = md.value.renderer.rules.fence;
|
||||
md.value.renderer.rules.fence = (tokens, idx, ...args) => {
|
||||
const content = tokens[idx].content;
|
||||
@@ -244,7 +286,7 @@ async function* sendSSEPostRequest(url, fetchOptions) {
|
||||
const lines = res.body
|
||||
.pipeThrough(new TextDecoderStream())
|
||||
.pipeThrough(new TextLineStream());
|
||||
for await (const line of lines) {
|
||||
for await (const line of asyncIterator(lines)) {
|
||||
if (isDev) console.log({line});
|
||||
if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
|
||||
const data = JSON.parse(line.slice(5));
|
||||
@@ -278,6 +320,7 @@ const mainApp = createApp({
|
||||
themes: THEMES,
|
||||
configDefault: {...CONFIG_DEFAULT},
|
||||
configInfo: {...CONFIG_INFO},
|
||||
isDev,
|
||||
}
|
||||
},
|
||||
computed: {},
|
||||
@@ -289,6 +332,7 @@ const mainApp = createApp({
|
||||
if (this.isGenerating) chatScrollToBottom(true);
|
||||
});
|
||||
resizeObserver.observe(pendingMsgElem);
|
||||
this.setSelectedTheme(this.selectedTheme);
|
||||
},
|
||||
watch: {
|
||||
viewingConvId: function(val, oldVal) {
|
||||
@@ -305,6 +349,8 @@ const mainApp = createApp({
|
||||
},
|
||||
setSelectedTheme(theme) {
|
||||
this.selectedTheme = theme;
|
||||
document.body.setAttribute('data-theme', theme);
|
||||
document.body.setAttribute('data-color-scheme', daisyuiThemes[theme]?.['color-scheme'] ?? 'auto');
|
||||
StorageUtils.setTheme(theme);
|
||||
},
|
||||
newConversation() {
|
||||
@@ -513,6 +559,17 @@ const mainApp = createApp({
|
||||
fetchMessages() {
|
||||
this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? [];
|
||||
},
|
||||
|
||||
// debug functions
|
||||
async debugImportDemoConv() {
|
||||
const res = await fetch('/demo-conversation.json');
|
||||
const demoConv = await res.json();
|
||||
StorageUtils.remove(demoConv.id);
|
||||
for (const msg of demoConv.messages) {
|
||||
StorageUtils.appendMsg(demoConv.id, msg);
|
||||
}
|
||||
this.fetchConversation();
|
||||
}
|
||||
},
|
||||
});
|
||||
mainApp.config.errorHandler = alert;
|
||||
|
||||
@@ -1,26 +0,0 @@
|
||||
@tailwind base;
|
||||
@tailwind components;
|
||||
@tailwind utilities;
|
||||
|
||||
.markdown {
|
||||
h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
|
||||
pre {
|
||||
@apply whitespace-pre-wrap rounded-lg p-2;
|
||||
border: 1px solid currentColor;
|
||||
}
|
||||
/* TODO: fix markdown table */
|
||||
}
|
||||
|
||||
.show-on-hover {
|
||||
@apply md:opacity-0 md:group-hover:opacity-100;
|
||||
}
|
||||
.btn-mini {
|
||||
@apply cursor-pointer hover:shadow-md;
|
||||
}
|
||||
.chat-screen { max-width: 900px; }
|
||||
|
||||
.chat-bubble-base-300 {
|
||||
--tw-bg-opacity: 1;
|
||||
--tw-text-opacity: 1;
|
||||
@apply bg-base-300 text-base-content;
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
@use "sass:meta";
|
||||
|
||||
@tailwind base;
|
||||
@tailwind components;
|
||||
@tailwind utilities;
|
||||
|
||||
.markdown {
|
||||
h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
|
||||
pre {
|
||||
@apply whitespace-pre-wrap rounded-lg p-2;
|
||||
border: 1px solid currentColor;
|
||||
}
|
||||
/* TODO: fix markdown table */
|
||||
}
|
||||
|
||||
.show-on-hover {
|
||||
@apply md:opacity-0 md:group-hover:opacity-100;
|
||||
}
|
||||
.btn-mini {
|
||||
@apply cursor-pointer hover:shadow-md;
|
||||
}
|
||||
.chat-screen { max-width: 900px; }
|
||||
|
||||
.chat-bubble-base-300 {
|
||||
--tw-bg-opacity: 1;
|
||||
--tw-text-opacity: 1;
|
||||
@apply bg-base-300 text-base-content;
|
||||
}
|
||||
|
||||
/* Highlight.js */
|
||||
[data-color-scheme='light'] {
|
||||
@include meta.load-css('highlight.js/styles/stackoverflow-light');
|
||||
}
|
||||
[data-color-scheme='dark'] {
|
||||
@include meta.load-css('highlight.js/styles/stackoverflow-dark');
|
||||
}
|
||||
[data-color-scheme='auto'] {
|
||||
@media (prefers-color-scheme: light) {
|
||||
@include meta.load-css('highlight.js/styles/stackoverflow-light');
|
||||
}
|
||||
@media (prefers-color-scheme: dark) {
|
||||
@include meta.load-css('highlight.js/styles/stackoverflow-dark');
|
||||
}
|
||||
}
|
||||
.hljs {
|
||||
background: transparent !important;
|
||||
padding: 0.5em !important;
|
||||
}
|
||||
@@ -2,6 +2,9 @@
|
||||
import { viteSingleFile } from 'vite-plugin-singlefile';
|
||||
import path from 'path';
|
||||
import fs from 'fs';
|
||||
import zlib from 'zlib';
|
||||
|
||||
const MAX_BUNDLE_SIZE = 1.5 * 1024 * 1024; // only increase when absolutely necessary
|
||||
|
||||
const GUIDE_FOR_FRONTEND = `
|
||||
<!--
|
||||
@@ -12,25 +15,45 @@ const GUIDE_FOR_FRONTEND = `
|
||||
-->
|
||||
`.trim();
|
||||
|
||||
export default {
|
||||
plugins: [
|
||||
viteSingleFile(),
|
||||
(function llamaCppPlugin() {
|
||||
let config;
|
||||
return {
|
||||
name: 'llamacpp:build',
|
||||
apply: 'build',
|
||||
async configResolved(_config) {
|
||||
config = _config;
|
||||
},
|
||||
writeBundle() {
|
||||
const outputIndexHtml = path.join(config.build.outDir, 'index.html');
|
||||
const content = fs.readFileSync(outputIndexHtml, 'utf-8');
|
||||
const BUILD_PLUGINS = [
|
||||
viteSingleFile(),
|
||||
(function llamaCppPlugin() {
|
||||
let config;
|
||||
return {
|
||||
name: 'llamacpp:build',
|
||||
apply: 'build',
|
||||
async configResolved(_config) {
|
||||
config = _config;
|
||||
},
|
||||
writeBundle() {
|
||||
const outputIndexHtml = path.join(config.build.outDir, 'index.html');
|
||||
const content = GUIDE_FOR_FRONTEND + '\n' + fs.readFileSync(outputIndexHtml, 'utf-8');
|
||||
const compressed = zlib.gzipSync(Buffer.from(content, 'utf-8'), { level: 9 });
|
||||
|
||||
const targetOutputFile = path.join(config.build.outDir, '../../public/index.html');
|
||||
fs.writeFileSync(targetOutputFile, GUIDE_FOR_FRONTEND + '\n' + content);
|
||||
// because gzip header contains machine-specific info, we must remove these data from the header
|
||||
// timestamp
|
||||
compressed[0x4] = 0;
|
||||
compressed[0x5] = 0;
|
||||
compressed[0x6] = 0;
|
||||
compressed[0x7] = 0;
|
||||
// OS
|
||||
compressed[0x9] = 0;
|
||||
|
||||
if (compressed.byteLength > MAX_BUNDLE_SIZE) {
|
||||
throw new Error(
|
||||
`Bundle size is too large (${Math.ceil(compressed.byteLength / 1024)} KB).\n` +
|
||||
`Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.js.\n`,
|
||||
);
|
||||
}
|
||||
|
||||
const targetOutputFile = path.join(config.build.outDir, '../../public/index.html.gz');
|
||||
fs.writeFileSync(targetOutputFile, compressed);
|
||||
}
|
||||
})(),
|
||||
],
|
||||
}
|
||||
})(),
|
||||
];
|
||||
|
||||
/** @type {import('vite').UserConfig} */
|
||||
export default {
|
||||
plugins: process.env.ANALYZE ? [] : BUILD_PLUGINS,
|
||||
};
|
||||
|
||||
@@ -534,7 +534,6 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
|
||||
size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
|
||||
hn->buffer_id = buffer_id;
|
||||
hn->offset = offset;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -179,7 +179,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
|
||||
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
||||
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
|
||||
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
|
||||
if (MSVC)
|
||||
# instruction set detection for MSVC only
|
||||
if (GGML_NATIVE)
|
||||
|
||||
@@ -394,8 +394,11 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
|
||||
switch (op->op) {
|
||||
case GGML_OP_CPY:
|
||||
return
|
||||
op->type != GGML_TYPE_IQ3_XXS &&
|
||||
op->type != GGML_TYPE_IQ3_S &&
|
||||
op->type != GGML_TYPE_IQ2_XXS &&
|
||||
op->type != GGML_TYPE_IQ2_XS &&
|
||||
op->type != GGML_TYPE_IQ2_S &&
|
||||
op->type != GGML_TYPE_IQ1_S &&
|
||||
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
||||
case GGML_OP_MUL_MAT:
|
||||
|
||||
@@ -551,6 +551,22 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||
|
||||
// expose GGUF internals for test code
|
||||
|
||||
GGML_API size_t gguf_type_size(enum gguf_type type);
|
||||
|
||||
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
|
||||
|
||||
struct gguf_buf {
|
||||
void * data;
|
||||
size_t size;
|
||||
size_t offset;
|
||||
};
|
||||
GGML_API struct gguf_buf gguf_buf_init(size_t size);
|
||||
GGML_API void gguf_buf_free(struct gguf_buf buf);
|
||||
|
||||
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -245,6 +245,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
|
||||
vk_pipeline pipeline_timestep_embedding_f32;
|
||||
vk_pipeline pipeline_pool2d_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv6_f32;
|
||||
|
||||
// [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned}
|
||||
vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2];
|
||||
@@ -528,6 +529,13 @@ struct vk_op_pool2d_push_constants {
|
||||
int32_t p0; int32_t p1;
|
||||
};
|
||||
|
||||
struct vk_op_rwkv_wkv6_push_constants {
|
||||
uint32_t B;
|
||||
uint32_t T;
|
||||
uint32_t C;
|
||||
uint32_t H;
|
||||
};
|
||||
|
||||
// Allow pre-recording command buffers
|
||||
struct vk_staging_memcpy {
|
||||
vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
|
||||
@@ -1363,7 +1371,7 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
|
||||
// Needs to be kept up to date on shader changes
|
||||
const uint32_t bank_conflict_offset = device->coopmat_support ? 8 : 1;
|
||||
const uint32_t type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
|
||||
const uint32_t warps = warptile[0] / device->subgroup_size;
|
||||
const uint32_t warps = warptile[0] / warptile[10];
|
||||
|
||||
const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size;
|
||||
const uint32_t mmid_row_ids = mul_mat_id ? 3072 * sizeof(uint32_t) : 0;
|
||||
@@ -1377,8 +1385,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
std::cerr << "ggml_vulkan: Compiling shaders";
|
||||
|
||||
// some shaders require the subgroup size to be 16 or larger
|
||||
// some shaders have a minimum subgroup size
|
||||
const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
|
||||
const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u);
|
||||
|
||||
// mulmat
|
||||
std::vector<uint32_t> l_warptile, m_warptile, s_warptile,
|
||||
@@ -1445,7 +1454,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size };
|
||||
m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size };
|
||||
s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size };
|
||||
s_warptile_mmq = { subgroup_size_32, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size };
|
||||
|
||||
l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
|
||||
m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 };
|
||||
@@ -1864,7 +1873,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||
@@ -1878,7 +1887,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||
@@ -1892,7 +1901,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
|
||||
|
||||
// dequant shaders
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
||||
@@ -2014,6 +2023,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
||||
|
||||
for (auto &c : compiles) {
|
||||
c.wait();
|
||||
}
|
||||
@@ -5022,6 +5033,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_pool2d_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_rwkv_wkv6_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_leaky_relu_f32;
|
||||
@@ -5424,6 +5440,134 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, bool dryrun = false) {
|
||||
const ggml_tensor * k = dst->src[0];
|
||||
const ggml_tensor * v = dst->src[1];
|
||||
const ggml_tensor * r = dst->src[2];
|
||||
const ggml_tensor * tf = dst->src[3];
|
||||
const ggml_tensor * td = dst->src[4];
|
||||
const ggml_tensor * state = dst->src[5];
|
||||
|
||||
GGML_ASSERT(!ggml_is_quantized(k->type));
|
||||
GGML_ASSERT(!ggml_is_quantized(v->type));
|
||||
GGML_ASSERT(!ggml_is_quantized(r->type));
|
||||
GGML_ASSERT(!ggml_is_quantized(tf->type));
|
||||
GGML_ASSERT(!ggml_is_quantized(td->type));
|
||||
GGML_ASSERT(!ggml_is_quantized(state->type));
|
||||
GGML_ASSERT(dst->buffer != nullptr);
|
||||
|
||||
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, k, v, r, dst, GGML_OP_RWKV_WKV6);
|
||||
GGML_ASSERT(pipeline != nullptr);
|
||||
|
||||
if (dryrun) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
||||
ggml_backend_vk_buffer_context * k_buf_ctx = (ggml_backend_vk_buffer_context *)k->buffer->context;
|
||||
ggml_backend_vk_buffer_context * v_buf_ctx = (ggml_backend_vk_buffer_context *)v->buffer->context;
|
||||
ggml_backend_vk_buffer_context * r_buf_ctx = (ggml_backend_vk_buffer_context *)r->buffer->context;
|
||||
ggml_backend_vk_buffer_context * tf_buf_ctx = (ggml_backend_vk_buffer_context *)tf->buffer->context;
|
||||
ggml_backend_vk_buffer_context * td_buf_ctx = (ggml_backend_vk_buffer_context *)td->buffer->context;
|
||||
ggml_backend_vk_buffer_context * state_buf_ctx = (ggml_backend_vk_buffer_context *)state->buffer->context;
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
|
||||
vk_buffer d_D, d_K, d_V, d_R, d_TF, d_TD, d_State;
|
||||
uint64_t k_offset, v_offset, r_offset, tf_offset, td_offset, state_offset, dst_offset;
|
||||
bool K_uma = false, V_uma = false, R_uma = false, TF_uma = false, TD_uma = false, STATE_uma = false, DST_uma = false;
|
||||
|
||||
if (ctx->device->uma) {
|
||||
ggml_vk_host_get(ctx->device, k->data, d_K, k_offset);
|
||||
ggml_vk_host_get(ctx->device, v->data, d_V, v_offset);
|
||||
ggml_vk_host_get(ctx->device, r->data, d_R, r_offset);
|
||||
ggml_vk_host_get(ctx->device, tf->data, d_TF, tf_offset);
|
||||
ggml_vk_host_get(ctx->device, td->data, d_TD, td_offset);
|
||||
ggml_vk_host_get(ctx->device, state->data, d_State, state_offset);
|
||||
ggml_vk_host_get(ctx->device, dst->data, d_D, dst_offset);
|
||||
|
||||
K_uma = d_K != nullptr;
|
||||
V_uma = d_V != nullptr;
|
||||
R_uma = d_R != nullptr;
|
||||
TF_uma = d_TF != nullptr;
|
||||
TD_uma = d_TD != nullptr;
|
||||
STATE_uma = d_State != nullptr;
|
||||
DST_uma = d_D != nullptr;
|
||||
}
|
||||
|
||||
if (!K_uma) {
|
||||
d_K = k_buf_ctx->dev_buffer;
|
||||
k_offset = vk_tensor_offset(k) + k->view_offs;
|
||||
}
|
||||
if (!V_uma) {
|
||||
d_V = v_buf_ctx->dev_buffer;
|
||||
v_offset = vk_tensor_offset(v) + v->view_offs;
|
||||
}
|
||||
if (!R_uma) {
|
||||
d_R = r_buf_ctx->dev_buffer;
|
||||
r_offset = vk_tensor_offset(r) + r->view_offs;
|
||||
}
|
||||
if (!TF_uma) {
|
||||
d_TF = tf_buf_ctx->dev_buffer;
|
||||
tf_offset = vk_tensor_offset(tf) + tf->view_offs;
|
||||
}
|
||||
if (!TD_uma) {
|
||||
d_TD = td_buf_ctx->dev_buffer;
|
||||
td_offset = vk_tensor_offset(td) + td->view_offs;
|
||||
}
|
||||
if (!STATE_uma) {
|
||||
d_State = state_buf_ctx->dev_buffer;
|
||||
state_offset = vk_tensor_offset(state) + state->view_offs;
|
||||
}
|
||||
if (!DST_uma) {
|
||||
d_D = dst_buf_ctx->dev_buffer;
|
||||
dst_offset = vk_tensor_offset(dst) + dst->view_offs;
|
||||
}
|
||||
|
||||
const uint64_t k_size = ggml_nbytes(k);
|
||||
const uint64_t v_size = ggml_nbytes(v);
|
||||
const uint64_t r_size = ggml_nbytes(r);
|
||||
const uint64_t tf_size = ggml_nbytes(tf);
|
||||
const uint64_t td_size = ggml_nbytes(td);
|
||||
const uint64_t state_size = ggml_nbytes(state);
|
||||
const uint64_t dst_size = ggml_nbytes(dst);
|
||||
|
||||
std::array<uint32_t, 3> elements = {
|
||||
(uint32_t)(pc.B * pc.H),
|
||||
1,
|
||||
1
|
||||
};
|
||||
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
|
||||
vk_subbuffer{ d_K, k_offset, k_size },
|
||||
vk_subbuffer{ d_V, v_offset, v_size },
|
||||
vk_subbuffer{ d_R, r_offset, r_size },
|
||||
vk_subbuffer{ d_TF, tf_offset, tf_size },
|
||||
vk_subbuffer{ d_TD, td_offset, td_size },
|
||||
vk_subbuffer{ d_State, state_offset, state_size },
|
||||
vk_subbuffer{ d_D, dst_offset, dst_size }
|
||||
}, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements);
|
||||
}
|
||||
|
||||
static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
|
||||
const size_t seq_length = dst->src[0]->ne[3];
|
||||
const size_t n_embed = dst->ne[0];
|
||||
const size_t n_heads = dst->src[0]->ne[2];
|
||||
const size_t n_seqs = dst->src[5]->ne[1];
|
||||
|
||||
ggml_vk_op_f32_rwkv6(
|
||||
ctx, subctx, dst,
|
||||
{
|
||||
(uint32_t)n_seqs,
|
||||
(uint32_t)seq_length,
|
||||
(uint32_t)n_embed,
|
||||
(uint32_t)n_heads,
|
||||
},
|
||||
dryrun
|
||||
);
|
||||
}
|
||||
|
||||
static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
int * op_params = (int *)dst->op_params;
|
||||
|
||||
@@ -6569,6 +6713,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
break;
|
||||
@@ -6768,6 +6913,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node, dryrun);
|
||||
|
||||
break;
|
||||
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
ggml_vk_rwkv_wkv6(ctx, compute_ctx, node, dryrun);
|
||||
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
@@ -6848,6 +6998,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_REPEAT:
|
||||
buf = tensor->buffer;
|
||||
@@ -7724,6 +7875,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
return true;
|
||||
default:
|
||||
@@ -8300,7 +8452,11 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
|
||||
} else if (tensor->op == GGML_OP_LEAKY_RELU) {
|
||||
const float * op_params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
|
||||
} else {
|
||||
} else if (tensor->op == GGML_OP_RWKV_WKV6) {
|
||||
tensor_clone = ggml_rwkv_wkv6(ggml_ctx, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3],
|
||||
tensor->src[4], tensor->src[5]);
|
||||
}
|
||||
else {
|
||||
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE];
|
||||
void soft_max(uint num_iters) {
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
||||
const uint rowy = rowx % p.KY;
|
||||
const uint rowy = (p.KY > 0) ? (rowx % p.KY) : 0;
|
||||
|
||||
if (rowx >= p.nrows_x) {
|
||||
return;
|
||||
|
||||
@@ -479,6 +479,8 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||
|
||||
for (auto &c : compiles) {
|
||||
c.wait();
|
||||
}
|
||||
|
||||
@@ -0,0 +1,87 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
#define BLOCK_SIZE 64
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(push_constant) uniform Parameters {
|
||||
uint B;
|
||||
uint T;
|
||||
uint C;
|
||||
uint H;
|
||||
};
|
||||
|
||||
layout(binding = 0) readonly buffer KBuf { A_TYPE k[]; };
|
||||
layout(binding = 1) readonly buffer VBuf { A_TYPE v[]; };
|
||||
layout(binding = 2) readonly buffer RBuf { A_TYPE r[]; };
|
||||
layout(binding = 3) readonly buffer TimeFBuf { A_TYPE tf[]; };
|
||||
layout(binding = 4) readonly buffer TimeDBuf { A_TYPE td[]; };
|
||||
layout(binding = 5) readonly buffer StateBuf { A_TYPE state_in[]; };
|
||||
layout(binding = 6) buffer DstBuf { A_TYPE dst[]; };
|
||||
|
||||
shared A_TYPE _k[BLOCK_SIZE], _r[BLOCK_SIZE], _tf[BLOCK_SIZE], _td[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint head_size = BLOCK_SIZE;
|
||||
const uint batch_id = gl_WorkGroupID.x / H;
|
||||
const uint head_id = gl_WorkGroupID.x % H;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
|
||||
const uint state_size = C * head_size;
|
||||
const uint n_seq_tokens = T / B;
|
||||
|
||||
if (batch_id >= B || head_id >= H) {
|
||||
return;
|
||||
}
|
||||
|
||||
A_TYPE state[BLOCK_SIZE];
|
||||
[[unroll]] for (uint i = 0; i < head_size; i++) {
|
||||
state[i] = state_in[batch_id * state_size + head_id * head_size * head_size
|
||||
+ i * head_size + tid];
|
||||
}
|
||||
|
||||
barrier();
|
||||
_tf[tid] = tf[head_id * head_size + tid];
|
||||
barrier();
|
||||
|
||||
const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid;
|
||||
const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid;
|
||||
|
||||
for (uint t = start_t; t < end_t; t += C) {
|
||||
barrier();
|
||||
_k[tid] = k[t];
|
||||
_r[tid] = r[t];
|
||||
_td[tid] = td[t];
|
||||
barrier();
|
||||
|
||||
const A_TYPE v_val = v[t];
|
||||
A_TYPE y = 0.0;
|
||||
|
||||
[[unroll]] for (uint j = 0; j < head_size; j += 4) {
|
||||
vec4 k_vec = vec4(_k[j], _k[j+1], _k[j+2], _k[j+3]);
|
||||
vec4 r_vec = vec4(_r[j], _r[j+1], _r[j+2], _r[j+3]);
|
||||
vec4 tf_vec = vec4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]);
|
||||
vec4 td_vec = vec4(_td[j], _td[j+1], _td[j+2], _td[j+3]);
|
||||
vec4 s_vec = vec4(state[j], state[j+1], state[j+2], state[j+3]);
|
||||
|
||||
vec4 kv = k_vec * v_val;
|
||||
|
||||
vec4 temp = tf_vec * kv + s_vec;
|
||||
y += dot(r_vec, temp);
|
||||
|
||||
s_vec = s_vec * td_vec + kv;
|
||||
state[j] = s_vec.x;
|
||||
state[j+1] = s_vec.y;
|
||||
state[j+2] = s_vec.z;
|
||||
state[j+3] = s_vec.w;
|
||||
}
|
||||
|
||||
dst[t] = y;
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = 0; i < head_size; i++) {
|
||||
dst[T * C + batch_id * state_size + head_id * head_size * head_size
|
||||
+ i * head_size + tid] = state[i];
|
||||
}
|
||||
}
|
||||
+26
-43
@@ -6037,12 +6037,12 @@ struct ggml_tensor * ggml_graph_get_tensor(const struct ggml_cgraph * cgraph, co
|
||||
|
||||
struct ggml_tensor * ggml_graph_get_grad(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
|
||||
const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
|
||||
return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grads[igrad] : NULL;
|
||||
return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grads ? cgraph->grads[igrad] : NULL;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
|
||||
const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
|
||||
return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grad_accs[igrad] : NULL;
|
||||
return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grad_accs ? cgraph->grad_accs[igrad] : NULL;
|
||||
}
|
||||
|
||||
void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
||||
@@ -6489,7 +6489,7 @@ struct gguf_context {
|
||||
void * data;
|
||||
};
|
||||
|
||||
static size_t gguf_type_size(enum gguf_type type) {
|
||||
size_t gguf_type_size(enum gguf_type type) {
|
||||
GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
|
||||
return GGUF_TYPE_SIZE[type];
|
||||
}
|
||||
@@ -6617,13 +6617,7 @@ struct gguf_context * gguf_init_empty(void) {
|
||||
return ctx;
|
||||
}
|
||||
|
||||
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
|
||||
FILE * file = ggml_fopen(fname, "rb");
|
||||
if (!file) {
|
||||
fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
|
||||
// offset from start of file
|
||||
size_t offset = 0;
|
||||
|
||||
@@ -6636,7 +6630,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
for (uint32_t i = 0; i < sizeof(magic); i++) {
|
||||
if (magic[i] != GGUF_MAGIC[i]) {
|
||||
fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
|
||||
fclose(file);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@@ -6647,7 +6640,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
|
||||
if (!ctx) {
|
||||
fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
|
||||
fclose(file);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -6665,7 +6657,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
if (ctx->header.version == 1) {
|
||||
fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@@ -6678,7 +6669,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read header\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@@ -6688,12 +6678,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
{
|
||||
const uint64_t n_kv = ctx->header.n_kv;
|
||||
|
||||
ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
|
||||
if (!ctx->kv) {
|
||||
fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
if (n_kv > 0) {
|
||||
ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
|
||||
if (!ctx->kv) {
|
||||
fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
for (uint64_t i = 0; i < n_kv; ++i) {
|
||||
@@ -6740,7 +6731,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
// prevent from integer overflow in the malloc below
|
||||
if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
|
||||
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@@ -6748,7 +6738,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
|
||||
if (!kv->value.arr.data) {
|
||||
fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@@ -6760,7 +6749,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
// prevent from integer overflow in the malloc below
|
||||
if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
|
||||
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@@ -6768,7 +6756,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
|
||||
if (!kv->value.arr.data) {
|
||||
fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@@ -6799,7 +6786,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@@ -6810,7 +6796,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
|
||||
if (!ctx->infos) {
|
||||
fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@@ -6846,7 +6831,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@@ -6889,7 +6873,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
// this tensor type support have been removed:
|
||||
fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
|
||||
__func__, info->name.data, (int) info->type, ggml_type_name(info->type));
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@@ -6897,7 +6880,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
if (ne % ggml_blck_size(info->type) != 0) {
|
||||
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
|
||||
__func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@@ -6929,7 +6911,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
*params.ctx = ggml_init(pdata);
|
||||
if (*params.ctx == NULL) {
|
||||
fprintf(stderr, "%s: failed to initialize context\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
@@ -6948,7 +6929,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read tensor data\n", __func__);
|
||||
fclose(file);
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
@@ -6987,7 +6967,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
|
||||
fclose(file);
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
@@ -6996,11 +6975,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
ggml_set_no_alloc(ctx_data, params.no_alloc);
|
||||
}
|
||||
|
||||
fclose(file);
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
|
||||
FILE * file = ggml_fopen(fname, "rb");
|
||||
if (!file) {
|
||||
fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct gguf_context * result = gguf_init_from_file_impl(file, params);
|
||||
fclose(file);
|
||||
return result;
|
||||
}
|
||||
|
||||
void gguf_free(struct gguf_context * ctx) {
|
||||
if (ctx == NULL) {
|
||||
return;
|
||||
@@ -7460,13 +7449,7 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo
|
||||
// fwrite(val, sizeof(char), size, file);
|
||||
//}
|
||||
|
||||
struct gguf_buf {
|
||||
void * data;
|
||||
size_t size;
|
||||
size_t offset;
|
||||
};
|
||||
|
||||
static struct gguf_buf gguf_buf_init(size_t size) {
|
||||
struct gguf_buf gguf_buf_init(size_t size) {
|
||||
struct gguf_buf buf = {
|
||||
/*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
|
||||
/*buf.size =*/ size,
|
||||
@@ -7476,7 +7459,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
|
||||
return buf;
|
||||
}
|
||||
|
||||
static void gguf_buf_free(struct gguf_buf buf) {
|
||||
void gguf_buf_free(struct gguf_buf buf) {
|
||||
if (buf.data) {
|
||||
GGML_FREE(buf.data);
|
||||
}
|
||||
@@ -7514,7 +7497,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
|
||||
buf->offset += el_size;
|
||||
}
|
||||
|
||||
static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
|
||||
void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
|
||||
// write header
|
||||
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
|
||||
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
|
||||
|
||||
@@ -249,6 +249,7 @@ class MODEL_ARCH(IntEnum):
|
||||
OLMOE = auto()
|
||||
OPENELM = auto()
|
||||
ARCTIC = auto()
|
||||
DEEPSEEK = auto()
|
||||
DEEPSEEK2 = auto()
|
||||
CHATGLM = auto()
|
||||
BITNET = auto()
|
||||
@@ -412,6 +413,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.OLMOE: "olmoe",
|
||||
MODEL_ARCH.OPENELM: "openelm",
|
||||
MODEL_ARCH.ARCTIC: "arctic",
|
||||
MODEL_ARCH.DEEPSEEK: "deepseek",
|
||||
MODEL_ARCH.DEEPSEEK2: "deepseek2",
|
||||
MODEL_ARCH.CHATGLM: "chatglm",
|
||||
MODEL_ARCH.BITNET: "bitnet",
|
||||
@@ -1158,6 +1160,29 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.DEEPSEEK: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP,
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
],
|
||||
MODEL_ARCH.DEEPSEEK2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
@@ -1380,6 +1405,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
MODEL_ARCH.DEEPSEEK: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
MODEL_ARCH.DEEPSEEK2: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
|
||||
@@ -306,7 +306,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.FFN_UP_SHEXP: (
|
||||
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
|
||||
"model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek2
|
||||
"model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
|
||||
),
|
||||
|
||||
# AWQ-activation gate
|
||||
@@ -338,7 +338,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP: (
|
||||
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
|
||||
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek2
|
||||
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
|
||||
),
|
||||
|
||||
# Feed-forward down
|
||||
@@ -379,7 +379,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
||||
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
|
||||
"model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek2
|
||||
"model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.11.0"
|
||||
version = "0.13.0"
|
||||
description = "Read and write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
|
||||
+5
-9
@@ -1139,16 +1139,12 @@ extern "C" {
|
||||
const char * grammar_str,
|
||||
const char * grammar_root);
|
||||
|
||||
/// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
|
||||
int32_t n_vocab, // llama_n_vocab()
|
||||
llama_token special_eos_id, // llama_token_eos()
|
||||
llama_token linefeed_id, // llama_token_nl()
|
||||
int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat, // 1.0 = disabled
|
||||
float penalty_freq, // 0.0 = disabled
|
||||
float penalty_present, // 0.0 = disabled
|
||||
bool penalize_nl, // consider newlines as a repeatable token
|
||||
bool ignore_eos); // ignore the end-of-sequence token
|
||||
int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat, // 1.0 = disabled
|
||||
float penalty_freq, // 0.0 = disabled
|
||||
float penalty_present); // 0.0 = disabled
|
||||
|
||||
/// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_dry(
|
||||
|
||||
@@ -20,11 +20,13 @@ if [ -n "$GGML_CUDA" ]; then
|
||||
cmake_opts="-DGGML_CUDA=ON"
|
||||
fi
|
||||
|
||||
dir="build-bench"
|
||||
|
||||
function run {
|
||||
rm -fr build > /dev/null
|
||||
cmake -B build -S . $cmake_opts > /dev/null
|
||||
cmake --build build -t llama-bench > /dev/null
|
||||
build/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
|
||||
rm -fr ${dir} > /dev/null
|
||||
cmake -B ${dir} -S . $cmake_opts > /dev/null
|
||||
cmake --build ${dir} -t llama-bench > /dev/null
|
||||
${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
|
||||
}
|
||||
|
||||
git checkout $1 > /dev/null
|
||||
|
||||
@@ -1 +1 @@
|
||||
74d66b63eaf207a24f3e93bb922aba131cbf2906
|
||||
e6d93f40dffe8733d5d72f1d8fa6b3ca27ae899f
|
||||
|
||||
+35
-90
@@ -1396,19 +1396,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
|
||||
// penalties
|
||||
|
||||
struct llama_sampler_penalties {
|
||||
const int32_t n_vocab;
|
||||
const llama_token special_eos_id;
|
||||
const llama_token linefeed_id;
|
||||
|
||||
const int32_t penalty_last_n;
|
||||
const float penalty_repeat;
|
||||
const float penalty_freq;
|
||||
const float penalty_present;
|
||||
|
||||
const bool penalize_nl;
|
||||
const bool ignore_eos;
|
||||
|
||||
ring_buffer<llama_token> prev;
|
||||
|
||||
// a frequency map to count token occurrences
|
||||
std::unordered_map<llama_token, int> token_count;
|
||||
};
|
||||
|
||||
static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
|
||||
@@ -1421,76 +1417,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
|
||||
return;
|
||||
}
|
||||
|
||||
ctx->token_count[token]++;
|
||||
|
||||
// if the ring buffer is full, remove the oldest token
|
||||
if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
|
||||
const auto old = ctx->prev.front();
|
||||
|
||||
ctx->token_count[old]--;
|
||||
if (ctx->token_count[old] == 0) {
|
||||
ctx->token_count.erase(old);
|
||||
}
|
||||
}
|
||||
|
||||
ctx->prev.push_back(token);
|
||||
|
||||
#if 0
|
||||
// sanity check
|
||||
std::unordered_map<llama_token, int> tmp;
|
||||
for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
|
||||
tmp[ctx->prev.rat(i)]++;
|
||||
}
|
||||
|
||||
assert(ctx->token_count == tmp);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
auto * ctx = (llama_sampler_penalties *) smpl->ctx;
|
||||
|
||||
if (ctx->ignore_eos) {
|
||||
assert(ctx->special_eos_id >= 0);
|
||||
|
||||
// optimistically check if the candidates are not yet sorted/shuffled/truncated
|
||||
if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
|
||||
cur_p->data[ctx->special_eos_id].logit = -INFINITY;
|
||||
} else {
|
||||
// else, search for the special EOS token
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (cur_p->data[i].id == ctx->special_eos_id) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((ctx->penalty_last_n == 0) ||
|
||||
(ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
|
||||
return;
|
||||
}
|
||||
|
||||
bool nl_found = false;
|
||||
size_t nl_idx = 0;
|
||||
float nl_logit = -INFINITY;
|
||||
if (!ctx->penalize_nl) {
|
||||
assert(ctx->linefeed_id >= 0);
|
||||
|
||||
// optimistically check if the candidates are not yet sorted/shuffled/truncated
|
||||
if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
|
||||
nl_found = true;
|
||||
nl_idx = ctx->linefeed_id;
|
||||
nl_logit = cur_p->data[ctx->linefeed_id].logit;
|
||||
} else {
|
||||
// else, search for the linefeed token
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (cur_p->data[i].id == ctx->linefeed_id) {
|
||||
nl_found = true;
|
||||
nl_idx = i;
|
||||
nl_logit = cur_p->data[i].logit;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create a frequency map to count occurrences of each token in last_tokens
|
||||
// TODO: optimize this by maintaining the token count in the sampler context
|
||||
using llama_token_cnt = std::unordered_map<llama_token, int>;
|
||||
llama_token_cnt token_count;
|
||||
|
||||
for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
|
||||
token_count[ctx->prev.rat(i)]++;
|
||||
}
|
||||
|
||||
// Apply frequency and presence penalties to the cur_p
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
const auto token_iter = token_count.find(cur_p->data[i].id);
|
||||
if (token_iter == token_count.end()) {
|
||||
const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
|
||||
if (token_iter == ctx->token_count.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int count = token_iter->second;
|
||||
|
||||
assert(count > 0 && count <= ctx->penalty_last_n);
|
||||
|
||||
// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
|
||||
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
|
||||
if (cur_p->data[i].logit <= 0) {
|
||||
@@ -1503,30 +1473,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
|
||||
}
|
||||
|
||||
cur_p->sorted = false;
|
||||
|
||||
if (!ctx->penalize_nl && nl_found) {
|
||||
// restore the logit of the newline token if it was penalized
|
||||
cur_p->data[nl_idx].logit = nl_logit;
|
||||
}
|
||||
}
|
||||
|
||||
static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
|
||||
auto * ctx = (llama_sampler_penalties *) smpl->ctx;
|
||||
ctx->prev.clear();
|
||||
ctx->token_count.clear();
|
||||
}
|
||||
|
||||
static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
|
||||
const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
|
||||
auto * result = llama_sampler_init_penalties(
|
||||
ctx->n_vocab,
|
||||
ctx->special_eos_id,
|
||||
ctx->linefeed_id,
|
||||
ctx->penalty_last_n,
|
||||
ctx->penalty_repeat,
|
||||
ctx->penalty_freq,
|
||||
ctx->penalty_present,
|
||||
ctx->penalize_nl,
|
||||
ctx->ignore_eos);
|
||||
ctx->penalty_present);
|
||||
|
||||
// copy the state
|
||||
{
|
||||
@@ -1552,38 +1513,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_penalties(
|
||||
int32_t n_vocab,
|
||||
llama_token special_eos_id,
|
||||
llama_token linefeed_id,
|
||||
int32_t penalty_last_n,
|
||||
float penalty_repeat,
|
||||
float penalty_freq,
|
||||
float penalty_present,
|
||||
bool penalize_nl,
|
||||
bool ignore_eos) {
|
||||
if (linefeed_id == LLAMA_TOKEN_NULL) {
|
||||
penalize_nl = true;
|
||||
}
|
||||
|
||||
if (special_eos_id == LLAMA_TOKEN_NULL) {
|
||||
ignore_eos = false;
|
||||
}
|
||||
|
||||
float penalty_present) {
|
||||
penalty_last_n = std::max(penalty_last_n, 0);
|
||||
|
||||
return new llama_sampler {
|
||||
/* .iface = */ &llama_sampler_penalties_i,
|
||||
/* .ctx = */ new llama_sampler_penalties {
|
||||
/* .n_vocab = */ n_vocab,
|
||||
/* .special_eos_id = */ special_eos_id,
|
||||
/* .linefeed_id = */ linefeed_id,
|
||||
/* .penalty_last_n = */ penalty_last_n,
|
||||
/* .penalty_repeat = */ penalty_repeat,
|
||||
/* .penalty_freq = */ penalty_freq,
|
||||
/* .penalty_present = */ penalty_present,
|
||||
/* .penalize_nl = */ penalize_nl,
|
||||
/* .ignore_eos = */ ignore_eos,
|
||||
/* .prev = */ ring_buffer<llama_token>(penalty_last_n),
|
||||
/* .token_count = */ {},
|
||||
},
|
||||
};
|
||||
}
|
||||
@@ -1611,7 +1555,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
|
||||
if (word.find(str) != std::string::npos) {
|
||||
token_sequences.emplace(token_id, std::vector<llama_token>());
|
||||
} else {
|
||||
size_t word_len = word.size(), str_len = str.size();
|
||||
size_t word_len = word.size();
|
||||
size_t str_len = str.size();
|
||||
size_t pos = -1;
|
||||
while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
|
||||
bool match = true;
|
||||
|
||||
+1
-1
@@ -738,7 +738,7 @@ struct llm_tokenizer_wpm_session {
|
||||
std::vector<std::string> words(1, "");
|
||||
|
||||
for (const uint32_t cpt : cpts_nfd) {
|
||||
const auto flags = unicode_cpt_flags(cpt);
|
||||
const auto flags = unicode_cpt_flags_from_cpt(cpt);
|
||||
|
||||
if (flags.is_whitespace) {
|
||||
if (words.back().size()) { // finish previous word if any
|
||||
|
||||
+297
-8
@@ -184,6 +184,7 @@ enum llm_arch {
|
||||
LLM_ARCH_OLMOE,
|
||||
LLM_ARCH_OPENELM,
|
||||
LLM_ARCH_ARCTIC,
|
||||
LLM_ARCH_DEEPSEEK,
|
||||
LLM_ARCH_DEEPSEEK2,
|
||||
LLM_ARCH_CHATGLM,
|
||||
LLM_ARCH_BITNET,
|
||||
@@ -239,6 +240,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_OLMOE, "olmoe" },
|
||||
{ LLM_ARCH_OPENELM, "openelm" },
|
||||
{ LLM_ARCH_ARCTIC, "arctic" },
|
||||
{ LLM_ARCH_DEEPSEEK, "deepseek" },
|
||||
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
|
||||
{ LLM_ARCH_CHATGLM, "chatglm" },
|
||||
{ LLM_ARCH_BITNET, "bitnet" },
|
||||
@@ -1309,6 +1311,33 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_DEEPSEEK,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
||||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_DEEPSEEK2,
|
||||
{
|
||||
@@ -1600,6 +1629,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_EXAONE_3,
|
||||
LLM_CHAT_TEMPLATE_RWKV_WORLD,
|
||||
LLM_CHAT_TEMPLATE_GRANITE,
|
||||
LLM_CHAT_TEMPLATE_GIGACHAT,
|
||||
LLM_CHAT_TEMPLATE_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -1631,6 +1661,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
|
||||
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
|
||||
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
|
||||
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
|
||||
};
|
||||
|
||||
static llm_arch llm_arch_from_string(const std::string & name) {
|
||||
@@ -6094,6 +6125,19 @@ static void llm_load_hparams(
|
||||
model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 28: model.type = e_model::MODEL_20B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
{
|
||||
bool is_lite = (hparams.n_layer == 27);
|
||||
@@ -6440,6 +6484,7 @@ static void llm_load_vocab(
|
||||
tokenizer_pre == "phi-2" ||
|
||||
tokenizer_pre == "jina-es" ||
|
||||
tokenizer_pre == "jina-de" ||
|
||||
tokenizer_pre == "gigachat" ||
|
||||
tokenizer_pre == "jina-v1-en" ||
|
||||
tokenizer_pre == "jina-v2-es" ||
|
||||
tokenizer_pre == "jina-v2-de" ||
|
||||
@@ -7091,6 +7136,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
|
||||
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
|
||||
|
||||
if (model.arch == LLM_ARCH_DEEPSEEK) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
}
|
||||
|
||||
if (model.arch == LLM_ARCH_DEEPSEEK2) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
||||
@@ -8865,6 +8917,55 @@ static bool llm_load_tensors(
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK:
|
||||
{
|
||||
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp;
|
||||
const int64_t n_expert_shared = hparams.n_expert_shared;
|
||||
|
||||
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
if (i < (int) hparams.n_layer_dense_lead) {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
} else {
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
||||
|
||||
if (n_expert == 0) {
|
||||
throw std::runtime_error("n_expert must be > 0");
|
||||
}
|
||||
if (n_expert_used == 0) {
|
||||
throw std::runtime_error("n_expert_used must be > 0");
|
||||
}
|
||||
|
||||
// MoE branch
|
||||
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||
|
||||
// Shared expert branch
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
{
|
||||
const bool is_lite = (hparams.n_layer == 27);
|
||||
@@ -15219,6 +15320,161 @@ struct llm_build_context {
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_deepseek() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
n_tokens = n_outputs;
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
||||
cur = llm_build_ffn(ctx0, lctx, cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
// MoE branch
|
||||
ggml_tensor * moe_out =
|
||||
llm_build_moe_ffn(ctx0, lctx, cur,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, false,
|
||||
false, hparams.expert_weights_scale,
|
||||
cb, il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
// FFN shared expert
|
||||
{
|
||||
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
|
||||
model.layers[il].ffn_up_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_down_shexp, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(ffn_shexp, "ffn_shexp", il);
|
||||
|
||||
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_deepseek2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
@@ -16906,6 +17162,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_arctic();
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK:
|
||||
{
|
||||
result = llm.build_deepseek();
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
{
|
||||
result = llm.build_deepseek2();
|
||||
@@ -20137,6 +20397,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
case LLM_ARCH_OLMO:
|
||||
case LLM_ARCH_ARCTIC:
|
||||
case LLM_ARCH_DEEPSEEK:
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
case LLM_ARCH_CHATGLM:
|
||||
case LLM_ARCH_GRANITE:
|
||||
@@ -22002,6 +22263,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
|
||||
return LLM_CHAT_TEMPLATE_RWKV_WORLD;
|
||||
} else if (tmpl_contains("<|start_of_role|>")) {
|
||||
return LLM_CHAT_TEMPLATE_GRANITE;
|
||||
} else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
|
||||
return LLM_CHAT_TEMPLATE_GIGACHAT;
|
||||
}
|
||||
return LLM_CHAT_TEMPLATE_UNKNOWN;
|
||||
}
|
||||
@@ -22325,6 +22588,32 @@ static int32_t llama_chat_apply_template_internal(
|
||||
if (add_ass) {
|
||||
ss << "<|start_of_role|>assistant<|end_of_role|>\n";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
|
||||
// GigaChat template
|
||||
bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
|
||||
|
||||
// Handle system message if present
|
||||
if (has_system) {
|
||||
ss << "<s>" << chat[0]->content << "<|message_sep|>";
|
||||
} else {
|
||||
ss << "<s>";
|
||||
}
|
||||
|
||||
// Process remaining messages
|
||||
for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
|
||||
std::string role(chat[i]->role);
|
||||
if (role == "user") {
|
||||
ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
|
||||
<< "available functions<|role_sep|>[]<|message_sep|>";
|
||||
} else if (role == "assistant") {
|
||||
ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
|
||||
}
|
||||
}
|
||||
|
||||
// Add generation prompt if needed
|
||||
if (add_ass) {
|
||||
ss << "assistant<|role_sep|>";
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
@@ -22344,15 +22633,15 @@ int32_t llama_chat_apply_template(
|
||||
std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
|
||||
if (tmpl == nullptr) {
|
||||
GGML_ASSERT(model != nullptr);
|
||||
// load template from model
|
||||
std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
|
||||
std::string template_key = "tokenizer.chat_template";
|
||||
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
|
||||
if (res < 0) {
|
||||
|
||||
// load template from model, if available
|
||||
const auto & it = model->gguf_kv.find("tokenizer.chat_template");
|
||||
if (it != model->gguf_kv.end() && it->second.size() > 0) {
|
||||
curr_tmpl = it->second;
|
||||
}
|
||||
else {
|
||||
// worst case: there is no information about template, we will use chatml by default
|
||||
curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
|
||||
} else {
|
||||
curr_tmpl = std::string(model_template.data(), model_template.size());
|
||||
curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+51
-51
@@ -71,15 +71,15 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
throw std::invalid_argument("failed to convert utf8 to codepoint");
|
||||
}
|
||||
|
||||
//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
|
||||
//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt) {
|
||||
// std::vector<uint16_t> result;
|
||||
// if (/* 0x0000 <= cp && */ cp <= 0xffff) {
|
||||
// result.emplace_back(cp);
|
||||
// if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
|
||||
// result.emplace_back(cpt);
|
||||
// return result;
|
||||
// }
|
||||
// if (0x10000 <= cp && cp <= 0x10ffff) {
|
||||
// result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
|
||||
// result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
|
||||
// if (0x10000 <= cpt && cpt <= 0x10ffff) {
|
||||
// result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
|
||||
// result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
|
||||
// return result;
|
||||
// }
|
||||
// throw std::invalid_argument("failed to convert codepoint to utf16");
|
||||
@@ -120,8 +120,8 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
// return result;
|
||||
//}
|
||||
|
||||
static std::vector<codepoint_flags> unicode_cpt_flags_array() {
|
||||
std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
|
||||
static std::vector<unicode_cpt_flags> unicode_cpt_flags_array() {
|
||||
std::vector<unicode_cpt_flags> cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED);
|
||||
|
||||
assert (unicode_ranges_flags.begin()[0].first == 0);
|
||||
assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
|
||||
@@ -253,8 +253,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||
};
|
||||
|
||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
|
||||
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
|
||||
};
|
||||
|
||||
size_t _prev_end = offset_ini;
|
||||
@@ -371,8 +371,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||
};
|
||||
|
||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
|
||||
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
|
||||
};
|
||||
|
||||
size_t _prev_end = offset_ini;
|
||||
@@ -572,29 +572,29 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
|
||||
// interface
|
||||
//
|
||||
|
||||
std::string unicode_cpt_to_utf8(uint32_t cp) {
|
||||
std::string unicode_cpt_to_utf8(uint32_t cpt) {
|
||||
std::string result;
|
||||
|
||||
if (/* 0x00 <= cp && */ cp <= 0x7f) {
|
||||
result.push_back(cp);
|
||||
if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
|
||||
result.push_back(cpt);
|
||||
return result;
|
||||
}
|
||||
if (0x80 <= cp && cp <= 0x7ff) {
|
||||
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
if (0x80 <= cpt && cpt <= 0x7ff) {
|
||||
result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
|
||||
result.push_back(0x80 | (cpt & 0x3f));
|
||||
return result;
|
||||
}
|
||||
if (0x800 <= cp && cp <= 0xffff) {
|
||||
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
|
||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
if (0x800 <= cpt && cpt <= 0xffff) {
|
||||
result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
|
||||
result.push_back(0x80 | ((cpt >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cpt & 0x3f));
|
||||
return result;
|
||||
}
|
||||
if (0x10000 <= cp && cp <= 0x10ffff) {
|
||||
result.push_back(0xf0 | ((cp >> 18) & 0x07));
|
||||
result.push_back(0x80 | ((cp >> 12) & 0x3f));
|
||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
if (0x10000 <= cpt && cpt <= 0x10ffff) {
|
||||
result.push_back(0xf0 | ((cpt >> 18) & 0x07));
|
||||
result.push_back(0x80 | ((cpt >> 12) & 0x3f));
|
||||
result.push_back(0x80 | ((cpt >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cpt & 0x3f));
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -624,19 +624,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
||||
return result;
|
||||
}
|
||||
|
||||
codepoint_flags unicode_cpt_flags(const uint32_t cp) {
|
||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
||||
unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) {
|
||||
static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
|
||||
static const auto cpt_flags = unicode_cpt_flags_array();
|
||||
return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
|
||||
return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef;
|
||||
}
|
||||
|
||||
codepoint_flags unicode_cpt_flags(const std::string & utf8) {
|
||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
||||
unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) {
|
||||
static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
|
||||
if (utf8.empty()) {
|
||||
return undef; // undefined
|
||||
}
|
||||
size_t offset = 0;
|
||||
return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
|
||||
return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset));
|
||||
}
|
||||
|
||||
std::string unicode_byte_to_utf8(uint8_t byte) {
|
||||
@@ -649,41 +649,41 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
|
||||
return map.at(utf8);
|
||||
}
|
||||
|
||||
uint32_t unicode_tolower(uint32_t cp) {
|
||||
uint32_t unicode_tolower(uint32_t cpt) {
|
||||
// binary search
|
||||
auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
|
||||
auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt,
|
||||
[](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
|
||||
return pair.first < value;
|
||||
});
|
||||
if (it != unicode_map_lowercase.end() && it->first == cp) {
|
||||
if (it != unicode_map_lowercase.end() && it->first == cpt) {
|
||||
return it->second;
|
||||
}
|
||||
return cp; // Return the original code point if no lowercase mapping is found
|
||||
return cpt; // Return the original code point if no lowercase mapping is found
|
||||
}
|
||||
|
||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
||||
// unicode categories
|
||||
static const std::map<std::string, int> k_ucat_enum = {
|
||||
{ "\\p{N}", codepoint_flags::NUMBER },
|
||||
{ "\\p{L}", codepoint_flags::LETTER },
|
||||
{ "\\p{P}", codepoint_flags::PUNCTUATION },
|
||||
{ "\\p{N}", unicode_cpt_flags::NUMBER },
|
||||
{ "\\p{L}", unicode_cpt_flags::LETTER },
|
||||
{ "\\p{P}", unicode_cpt_flags::PUNCTUATION },
|
||||
};
|
||||
|
||||
static const std::map<int, int> k_ucat_cpt = {
|
||||
{ codepoint_flags::NUMBER, 0xD1 },
|
||||
{ codepoint_flags::LETTER, 0xD2 },
|
||||
{ codepoint_flags::PUNCTUATION, 0xD3 },
|
||||
{ unicode_cpt_flags::NUMBER, 0xD1 },
|
||||
{ unicode_cpt_flags::LETTER, 0xD2 },
|
||||
{ unicode_cpt_flags::PUNCTUATION, 0xD3 },
|
||||
};
|
||||
|
||||
static const std::map<int, std::string> k_ucat_map = {
|
||||
{ codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9
|
||||
{ codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
|
||||
{ codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
|
||||
{ unicode_cpt_flags::NUMBER, "\x30-\x39" }, // 0-9
|
||||
{ unicode_cpt_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
|
||||
{ unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
|
||||
};
|
||||
|
||||
// compute collapsed codepoints only if needed by at least one regex
|
||||
bool need_collapse = false;
|
||||
for (auto & regex_expr : regex_exprs) {
|
||||
for (const auto & regex_expr : regex_exprs) {
|
||||
// search for unicode categories
|
||||
for (const auto & ucat : k_ucat_enum) {
|
||||
if (std::string::npos != regex_expr.find(ucat.first)) {
|
||||
@@ -709,7 +709,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto flags = unicode_cpt_flags(cpts[i]);
|
||||
const auto flags = unicode_cpt_flags_from_cpt(cpts[i]);
|
||||
|
||||
if (flags.is_whitespace) {
|
||||
//NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.
|
||||
@@ -725,7 +725,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
|
||||
std::vector<size_t> bpe_offsets = { cpts.size() };
|
||||
|
||||
for (auto & regex_expr : regex_exprs) {
|
||||
for (const auto & regex_expr : regex_exprs) {
|
||||
// first, see if we have an efficient custom regex implementation
|
||||
auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
|
||||
|
||||
@@ -739,7 +739,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
// if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
|
||||
// with the corresponding collapsed representation
|
||||
bool use_collapsed = false;
|
||||
for (auto & ucat : k_ucat_enum) {
|
||||
for (const auto & ucat : k_ucat_enum) {
|
||||
if (std::string::npos != regex_expr.find(ucat.first)) {
|
||||
use_collapsed = true;
|
||||
break;
|
||||
@@ -805,7 +805,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
// std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
|
||||
std::wstring wtext(cpts.begin(), cpts.end());
|
||||
for (size_t i = 0; i < wtext.size(); ++i) {
|
||||
if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
|
||||
if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
|
||||
wtext[i] = 0x0B;
|
||||
}
|
||||
}
|
||||
|
||||
+9
-10
@@ -4,9 +4,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// TODO: prefix all symbols with "llama_"
|
||||
|
||||
struct codepoint_flags {
|
||||
struct unicode_cpt_flags {
|
||||
enum {
|
||||
UNDEFINED = 0x0001,
|
||||
NUMBER = 0x0002, // regex: \p{N}
|
||||
@@ -35,7 +33,7 @@ struct codepoint_flags {
|
||||
uint16_t is_nfd : 1;
|
||||
|
||||
// decode from uint16
|
||||
inline codepoint_flags(const uint16_t flags=0) {
|
||||
inline unicode_cpt_flags(const uint16_t flags = 0) {
|
||||
*reinterpret_cast<uint16_t*>(this) = flags;
|
||||
}
|
||||
|
||||
@@ -50,18 +48,19 @@ struct codepoint_flags {
|
||||
|
||||
size_t unicode_len_utf8(char src);
|
||||
|
||||
std::string unicode_cpt_to_utf8(uint32_t cp);
|
||||
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
|
||||
std::string unicode_cpt_to_utf8 (uint32_t cpt);
|
||||
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
||||
|
||||
codepoint_flags unicode_cpt_flags(const uint32_t cp);
|
||||
codepoint_flags unicode_cpt_flags(const std::string & utf8);
|
||||
unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
|
||||
unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
|
||||
|
||||
std::string unicode_byte_to_utf8(uint8_t byte);
|
||||
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
||||
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
||||
|
||||
uint32_t unicode_tolower(uint32_t cp);
|
||||
uint32_t unicode_tolower(uint32_t cpt);
|
||||
|
||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
|
||||
|
||||
@@ -129,6 +129,7 @@ llama_target_and_test(test-arg-parser.cpp)
|
||||
llama_target_and_test(test-chat-template.cpp)
|
||||
|
||||
# llama_target_and_test(test-opt.cpp) # SLOW
|
||||
llama_target_and_test(test-gguf.cpp)
|
||||
llama_target_and_test(test-backend-ops.cpp)
|
||||
|
||||
llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
|
||||
|
||||
@@ -3549,8 +3549,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
|
||||
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
||||
for (ggml_type type_dst : all_types) {
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
||||
}
|
||||
}
|
||||
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
||||
|
||||
@@ -75,6 +75,8 @@ int main(void) {
|
||||
"{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST]\" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + eos_token}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n",
|
||||
// mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template)
|
||||
"{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}",
|
||||
// ai-sage/GigaChat-20B-A3B-instruct
|
||||
"{% if messages[0]['role'] == 'system' -%}\n {%- set loop_messages = messages[1:] -%}\n {%- set system_message = bos_token + messages[0]['content'] + additional_special_tokens[1] -%}\n{%- else -%}\n {%- set loop_messages = messages -%}\n {%- set system_message = bos_token + '' -%}\n{%- endif -%}\n{%- for message in loop_messages %}\n {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {% endif %}\n \n {%- if loop.index0 == 0 -%}\n {{ system_message -}}\n {%- endif -%}\n {%- if message['role'] == 'user' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {{ 'available functions' + additional_special_tokens[0] + additional_special_tokens[2] + additional_special_tokens[3] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if message['role'] == 'assistant' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if loop.last and add_generation_prompt -%}\n {{ 'assistant' + additional_special_tokens[0] -}}\n {%- endif -%}\n{%- endfor %}",
|
||||
};
|
||||
std::vector<std::string> expected_output = {
|
||||
// teknium/OpenHermes-2.5-Mistral-7B
|
||||
@@ -129,6 +131,8 @@ int main(void) {
|
||||
"[INST]You are a helpful assistant\n\nHello[/INST]Hi there</s>[INST]Who are you[/INST] I am an assistant </s>[INST]Another question[/INST]",
|
||||
// mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template)
|
||||
"[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant </s>[INST] Another question[/INST]",
|
||||
// ai-sage/GigaChat-20B-A3B-instruct
|
||||
"<s>You are a helpful assistant<|message_sep|>user<|role_sep|>Hello<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi there<|message_sep|>user<|role_sep|>Who are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|> I am an assistant <|message_sep|>user<|role_sep|>Another question<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>",
|
||||
};
|
||||
std::vector<char> formatted_chat(1024);
|
||||
int32_t res;
|
||||
@@ -190,6 +194,7 @@ int main(void) {
|
||||
assert(fmt_sys("mistral") == "[INST] You are a helpful assistant\n"); // for old pre-v1 templates
|
||||
assert(fmt_sys("gemma") == ""); // for gemma, system message is merged with user message
|
||||
assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
|
||||
assert(fmt_sys("gigachat") == "<s>You are a helpful assistant<|message_sep|>");
|
||||
|
||||
|
||||
// test llama_chat_format_single for user message
|
||||
@@ -214,6 +219,7 @@ int main(void) {
|
||||
assert(fmt_single("mistral") == "[INST] How are you [/INST]"); // for old pre-v1 templates
|
||||
assert(fmt_single("gemma") == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
|
||||
assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
|
||||
assert(fmt_single("gigachat") == "user<|role_sep|>How are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>");
|
||||
|
||||
printf("Test chat templates: OK\n");
|
||||
|
||||
|
||||
+1303
File diff suppressed because it is too large
Load Diff
@@ -145,7 +145,7 @@ static void test_penalties(
|
||||
sampler_tester tester(probs, probs_expected);
|
||||
|
||||
const size_t n_vocab = probs.size();
|
||||
auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
|
||||
auto * sampler = llama_sampler_init_penalties(last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
|
||||
|
||||
for (size_t i = 0; i < last_tokens.size(); i++) {
|
||||
llama_sampler_accept(sampler, last_tokens[i]);
|
||||
|
||||
Reference in New Issue
Block a user