mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-16 18:47:39 +02:00
Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 14278f55d2 | |||
| b1de96824b | |||
| 7ad554f90e | |||
| 5ee99c32f5 | |||
| c145f8a132 | |||
| 689a091bbe | |||
| f3f28c5395 | |||
| e75c6279d1 | |||
| 36376abe05 | |||
| 66c1968f7a | |||
| 1dcc3fde00 | |||
| 5d3de51f97 | |||
| fc0c8d286a |
+1
-1
@@ -526,7 +526,7 @@ if (LLAMA_SYCL)
|
||||
|
||||
message(STATUS "SYCL found")
|
||||
|
||||
add_compile_definitions(GML_USE_SYCL)
|
||||
add_compile_definitions(GGML_USE_SYCL)
|
||||
|
||||
if (LLAMA_SYCL_F16)
|
||||
add_compile_definitions(GGML_SYCL_F16)
|
||||
|
||||
@@ -768,7 +768,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread
|
||||
|
||||
#### How to run
|
||||
|
||||
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
|
||||
3. Output:
|
||||
```
|
||||
|
||||
@@ -219,7 +219,7 @@ function gg_run_open_llama_3b_v2 {
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
|
||||
|
||||
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
|
||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
||||
|
||||
@@ -401,7 +401,7 @@ function gg_run_open_llama_7b_v2 {
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
|
||||
|
||||
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
|
||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||
|
||||
path_models="../models-mnt/open-llama/7B-v2"
|
||||
|
||||
+5
-3
@@ -1704,6 +1704,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
}
|
||||
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
||||
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
||||
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
|
||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
||||
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
||||
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
||||
@@ -1741,7 +1742,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
|
||||
fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
|
||||
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
|
||||
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
|
||||
fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
|
||||
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
||||
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
||||
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
|
||||
@@ -1750,7 +1751,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
|
||||
|
||||
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
||||
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
|
||||
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
|
||||
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
||||
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
||||
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
||||
@@ -1801,7 +1802,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
|
||||
if (cs_curr[j] < 0) { continue; }
|
||||
if (seqs.find(cs_curr[j]) == seqs.end()) {
|
||||
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
|
||||
seqs[cs_curr[j]] = seqs.size();
|
||||
const size_t sz = seqs.size();
|
||||
seqs[cs_curr[j]] = sz;
|
||||
}
|
||||
}
|
||||
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
|
||||
|
||||
+2
-2
@@ -121,7 +121,7 @@ static void sampler_queue(
|
||||
struct llama_context * ctx_main,
|
||||
const llama_sampling_params & params,
|
||||
llama_token_data_array & cur_p,
|
||||
size_t & min_keep) {
|
||||
size_t min_keep) {
|
||||
const float temp = params.temp;
|
||||
const float dynatemp_range = params.dynatemp_range;
|
||||
const float dynatemp_exponent = params.dynatemp_exponent;
|
||||
@@ -249,7 +249,7 @@ static llama_token llama_sampling_sample_impl(
|
||||
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
|
||||
} else {
|
||||
// temperature sampling
|
||||
size_t min_keep = std::max(1, params.n_probs);
|
||||
size_t min_keep = std::max(1, params.min_keep);
|
||||
|
||||
sampler_queue(ctx_main, params, cur_p, min_keep);
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ enum class llama_sampler_type : char {
|
||||
typedef struct llama_sampling_params {
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
|
||||
@@ -159,7 +159,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||
LOG_TEE("\n");
|
||||
|
||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||
|
||||
@@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
|
||||
|
||||
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
||||
if (n_kv_req > n_ctx) {
|
||||
|
||||
@@ -325,14 +325,14 @@ struct train_params {
|
||||
};
|
||||
|
||||
static void print_params(struct my_llama_hparams * params) {
|
||||
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
||||
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
||||
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
||||
printf("%s: n_mult: %d\n", __func__, params->n_mult);
|
||||
printf("%s: n_head: %d\n", __func__, params->n_head);
|
||||
printf("%s: n_ff: %d\n", __func__, params->n_ff);
|
||||
printf("%s: n_layer: %d\n", __func__, params->n_layer);
|
||||
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
||||
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
|
||||
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
|
||||
printf("%s: n_embd: %u\n", __func__, params->n_embd);
|
||||
printf("%s: n_mult: %u\n", __func__, params->n_mult);
|
||||
printf("%s: n_head: %u\n", __func__, params->n_head);
|
||||
printf("%s: n_ff: %u\n", __func__, params->n_ff);
|
||||
printf("%s: n_layer: %u\n", __func__, params->n_layer);
|
||||
printf("%s: n_rot: %u\n", __func__, params->n_rot);
|
||||
}
|
||||
|
||||
static void init_model(struct my_llama_model * model) {
|
||||
@@ -350,25 +350,25 @@ static void init_model(struct my_llama_model * model) {
|
||||
model->train_tokens = 0;
|
||||
|
||||
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
||||
printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
|
||||
printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
|
||||
|
||||
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
|
||||
printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
|
||||
|
||||
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
||||
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
|
||||
|
||||
// printing the per-layer allocations here so we dont print in the for loop.
|
||||
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
|
||||
printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
|
||||
|
||||
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
|
||||
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
|
||||
|
||||
ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
|
||||
ggml_set_name(model->norm, "norm.weight");
|
||||
|
||||
@@ -53,7 +53,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa
|
||||
5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
|
||||
```sh
|
||||
python ./convert.py ../llava-v1.5-7b
|
||||
python ./convert.py ../llava-v1.5-7b --skip-unknown
|
||||
```
|
||||
|
||||
Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
|
||||
|
||||
@@ -19,10 +19,6 @@ mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_project
|
||||
projector = {name: checkpoint[name].float() for name in mm_tensors}
|
||||
torch.save(projector, f"{args.model}/llava.projector")
|
||||
|
||||
# remove these tensors from the checkpoint and save it again
|
||||
for name in mm_tensors:
|
||||
del checkpoint[name]
|
||||
|
||||
# BakLLaVA models contain CLIP tensors in it
|
||||
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
|
||||
if len(clip_tensors) > 0:
|
||||
@@ -39,7 +35,7 @@ if len(clip_tensors) > 0:
|
||||
f.write("{}\n")
|
||||
|
||||
|
||||
torch.save(checkpoint, path)
|
||||
torch.save(checkpoint, path)
|
||||
|
||||
print("Done!")
|
||||
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
|
||||
|
||||
@@ -309,7 +309,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
|
||||
}
|
||||
|
||||
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
// Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Output: `perplexity: 13.5106 [114/114]`
|
||||
// BOS tokens will be added for each chunk before eval
|
||||
@@ -447,7 +447,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
return perplexity_v2(ctx, params);
|
||||
}
|
||||
|
||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
// Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Output: `perplexity: 13.5106 [114/114]`
|
||||
// BOS tokens will be added for each chunk before eval
|
||||
@@ -1623,7 +1623,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
uint32_t n_ctx;
|
||||
in.read((char *)&n_ctx, sizeof(n_ctx));
|
||||
if (n_ctx > llama_n_ctx(ctx)) {
|
||||
fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n",
|
||||
fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
|
||||
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
|
||||
}
|
||||
|
||||
|
||||
@@ -39,6 +39,8 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
|
||||
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
|
||||
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
|
||||
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
|
||||
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
|
||||
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
|
||||
|
||||
## Build
|
||||
|
||||
@@ -135,6 +137,7 @@ node index.js
|
||||
- `{"status": "loading model"}` if the model is still being loaded.
|
||||
- `{"status": "error"}` if the model failed to load.
|
||||
- `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
|
||||
- `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available
|
||||
|
||||
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
|
||||
|
||||
@@ -196,6 +199,8 @@ node index.js
|
||||
|
||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
||||
|
||||
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0)
|
||||
|
||||
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
||||
|
||||
`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
|
||||
@@ -379,6 +384,69 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
}'
|
||||
```
|
||||
|
||||
- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
|
||||
|
||||
### Result JSON
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"dynatemp_exponent": 1.0,
|
||||
"dynatemp_range": 0.0,
|
||||
"frequency_penalty": 0.0,
|
||||
"grammar": "",
|
||||
"id": 0,
|
||||
"ignore_eos": false,
|
||||
"logit_bias": [],
|
||||
"min_p": 0.05000000074505806,
|
||||
"mirostat": 0,
|
||||
"mirostat_eta": 0.10000000149011612,
|
||||
"mirostat_tau": 5.0,
|
||||
"model": "llama-2-7b-32k-instruct.Q2_K.gguf",
|
||||
"n_ctx": 2048,
|
||||
"n_keep": 0,
|
||||
"n_predict": 100000,
|
||||
"n_probs": 0,
|
||||
"next_token": {
|
||||
"has_next_token": true,
|
||||
"n_remain": -1,
|
||||
"num_tokens_predicted": 0,
|
||||
"stopped_eos": false,
|
||||
"stopped_limit": false,
|
||||
"stopped_word": false,
|
||||
"stopping_word": ""
|
||||
},
|
||||
"penalize_nl": true,
|
||||
"penalty_prompt_tokens": [],
|
||||
"presence_penalty": 0.0,
|
||||
"prompt": "Say hello to llama.cpp",
|
||||
"repeat_last_n": 64,
|
||||
"repeat_penalty": 1.100000023841858,
|
||||
"samplers": [
|
||||
"top_k",
|
||||
"tfs_z",
|
||||
"typical_p",
|
||||
"top_p",
|
||||
"min_p",
|
||||
"temperature"
|
||||
],
|
||||
"seed": 42,
|
||||
"state": 1,
|
||||
"stop": [
|
||||
"\n"
|
||||
],
|
||||
"stream": false,
|
||||
"task_id": 0,
|
||||
"temperature": 0.0,
|
||||
"tfs_z": 1.0,
|
||||
"top_k": 40,
|
||||
"top_p": 0.949999988079071,
|
||||
"typical_p": 1.0,
|
||||
"use_penalty_prompt_tokens": false
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
## More examples
|
||||
|
||||
### Change system prompt on runtime
|
||||
|
||||
@@ -234,6 +234,7 @@
|
||||
mirostat_eta: 0.1, // learning rate
|
||||
grammar: '',
|
||||
n_probs: 0, // no completion_probabilities,
|
||||
min_keep: 0, // min probs from each sampler,
|
||||
image_data: [],
|
||||
cache_prompt: true,
|
||||
api_key: ''
|
||||
@@ -791,6 +792,9 @@
|
||||
<fieldset>
|
||||
${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
${IntField({ label: "Min Probabilities from each Sampler", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
<label for="api_key">API Key</label>
|
||||
<input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
#include <chrono>
|
||||
#include <condition_variable>
|
||||
#include <atomic>
|
||||
#include <signal.h>
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
@@ -40,6 +41,7 @@ struct server_params
|
||||
int32_t port = 8080;
|
||||
int32_t read_timeout = 600;
|
||||
int32_t write_timeout = 600;
|
||||
bool slots_endpoint = true;
|
||||
};
|
||||
|
||||
bool server_verbose = false;
|
||||
@@ -158,6 +160,7 @@ struct llama_client_slot
|
||||
int32_t n_decoded = 0;
|
||||
int32_t n_remaining = -1;
|
||||
int32_t i_batch = -1;
|
||||
int32_t n_predict = -1;
|
||||
|
||||
int32_t num_prompt_tokens = 0;
|
||||
int32_t num_prompt_tokens_processed = 0;
|
||||
@@ -409,6 +412,7 @@ struct llama_server_context
|
||||
|
||||
slot.id = i;
|
||||
slot.n_ctx = n_ctx_slot;
|
||||
slot.n_predict = params.n_predict;
|
||||
|
||||
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
|
||||
|
||||
@@ -544,6 +548,16 @@ struct llama_server_context
|
||||
slot->params.seed = json_value(data, "seed", default_params.seed);
|
||||
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
||||
|
||||
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
|
||||
// Might be better to reject the request with a 400 ?
|
||||
LOG_WARNING("Max tokens to predict exceeds server configuration", {
|
||||
{"params.n_predict", slot->params.n_predict},
|
||||
{"slot.n_predict", slot->n_predict},
|
||||
});
|
||||
slot->params.n_predict = slot->n_predict;
|
||||
}
|
||||
|
||||
// infill
|
||||
if (data.count("input_prefix") != 0)
|
||||
@@ -1052,6 +1066,7 @@ struct llama_server_context
|
||||
|
||||
return json {
|
||||
{"n_ctx", slot.n_ctx},
|
||||
{"n_predict", slot.n_predict},
|
||||
{"model", params.model_alias},
|
||||
{"seed", slot.params.seed},
|
||||
{"temperature", slot.sparams.temp},
|
||||
@@ -1079,6 +1094,7 @@ struct llama_server_context
|
||||
{"stream", slot.params.stream},
|
||||
{"logit_bias", slot.sparams.logit_bias},
|
||||
{"n_probs", slot.sparams.n_probs},
|
||||
{"min_keep", slot.sparams.min_keep},
|
||||
{"grammar", slot.sparams.grammar},
|
||||
{"samplers", samplers_sequence}
|
||||
};
|
||||
@@ -1913,14 +1929,16 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||
printf(" --log-disable disables logging to a file.\n");
|
||||
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
|
||||
printf("\n");
|
||||
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
||||
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
||||
printf(" --chat-template FORMAT_NAME");
|
||||
printf(" set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
|
||||
printf(" set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
@@ -2360,6 +2378,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
log_set_target(stdout);
|
||||
LOG_INFO("logging to file is disabled.", {});
|
||||
}
|
||||
else if (arg == "--slots-endpoint-disable")
|
||||
{
|
||||
sparams.slots_endpoint = false;
|
||||
}
|
||||
else if (arg == "--chat-template")
|
||||
{
|
||||
if (++i >= argc)
|
||||
@@ -2511,6 +2533,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
|
||||
}
|
||||
}
|
||||
|
||||
std::function<void(int)> shutdown_handler;
|
||||
inline void signal_handler(int signal) { shutdown_handler(signal); }
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
#if SERVER_VERBOSE != 1
|
||||
@@ -2561,8 +2586,35 @@ int main(int argc, char **argv)
|
||||
server_state current_state = state.load();
|
||||
switch(current_state) {
|
||||
case SERVER_STATE_READY:
|
||||
res.set_content(R"({"status": "ok"})", "application/json");
|
||||
res.status = 200; // HTTP OK
|
||||
if (llama.all_slots_are_idle) {
|
||||
res.set_content(R"({"status": "ok"})", "application/json");
|
||||
res.status = 200; // HTTP OK
|
||||
} else {
|
||||
int available_slots = 0;
|
||||
int processing_slots = 0;
|
||||
for (llama_client_slot & slot : llama.slots) {
|
||||
if (slot.available()) {
|
||||
available_slots++;
|
||||
} else {
|
||||
processing_slots++;
|
||||
}
|
||||
}
|
||||
if (available_slots > 0) {
|
||||
json health = {
|
||||
{"status", "ok"},
|
||||
{"slots_idle", available_slots},
|
||||
{"slots_processing", processing_slots}};
|
||||
res.set_content(health.dump(), "application/json");
|
||||
res.status = 200; // HTTP OK
|
||||
} else {
|
||||
json health = {
|
||||
{"status", "no slot available"},
|
||||
{"slots_idle", available_slots},
|
||||
{"slots_processing", processing_slots}};
|
||||
res.set_content(health.dump(), "application/json");
|
||||
res.status = 503; // HTTP Service Unavailable
|
||||
}
|
||||
}
|
||||
break;
|
||||
case SERVER_STATE_LOADING_MODEL:
|
||||
res.set_content(R"({"status": "loading model"})", "application/json");
|
||||
@@ -2575,6 +2627,32 @@ int main(int argc, char **argv)
|
||||
}
|
||||
});
|
||||
|
||||
if (sparams.slots_endpoint) {
|
||||
svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
|
||||
json slots;
|
||||
for (llama_client_slot & slot : llama.slots) {
|
||||
json slot_data = llama.get_formated_generation(slot);
|
||||
slot_data["id"] = slot.id;
|
||||
slot_data["task_id"] = slot.task_id;
|
||||
slot_data["state"] = slot.state;
|
||||
slot_data["prompt"] = slot.prompt;
|
||||
slot_data["next_token"] = {
|
||||
{"has_next_token", slot.has_next_token},
|
||||
{"n_remain", slot.n_remaining},
|
||||
{"num_tokens_predicted", slot.n_decoded},
|
||||
{"stopped_eos", slot.stopped_eos},
|
||||
{"stopped_word", slot.stopped_word},
|
||||
{"stopped_limit", slot.stopped_limit},
|
||||
{"stopping_word", slot.stopping_word},
|
||||
};
|
||||
|
||||
slots.push_back(slot_data);
|
||||
}
|
||||
res.set_content(slots.dump(), "application/json");
|
||||
res.status = 200; // HTTP OK
|
||||
});
|
||||
}
|
||||
|
||||
svr.set_logger(log_server_request);
|
||||
|
||||
svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
|
||||
@@ -3128,8 +3206,25 @@ int main(int argc, char **argv)
|
||||
std::placeholders::_2,
|
||||
std::placeholders::_3
|
||||
));
|
||||
llama.queue_tasks.start_loop();
|
||||
|
||||
shutdown_handler = [&](int) {
|
||||
llama.queue_tasks.terminate();
|
||||
};
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = signal_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
llama.queue_tasks.start_loop();
|
||||
svr.stop();
|
||||
t.join();
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
|
||||
struct llama_server_queue {
|
||||
int id = 0;
|
||||
std::mutex mutex_tasks;
|
||||
bool running;
|
||||
// queues
|
||||
std::vector<task_server> queue_tasks;
|
||||
std::vector<task_server> queue_tasks_deferred;
|
||||
@@ -278,9 +279,18 @@ struct llama_server_queue {
|
||||
queue_tasks_deferred.clear();
|
||||
}
|
||||
|
||||
// Start the main loop. This call is blocking
|
||||
[[noreturn]]
|
||||
// end the start_loop routine
|
||||
void terminate() {
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
running = false;
|
||||
}
|
||||
condition_tasks.notify_all();
|
||||
}
|
||||
|
||||
// Start the main loop.
|
||||
void start_loop() {
|
||||
running = true;
|
||||
while (true) {
|
||||
// new task arrived
|
||||
LOG_VERBOSE("have new task", {});
|
||||
@@ -324,8 +334,12 @@ struct llama_server_queue {
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (queue_tasks.empty()) {
|
||||
if (!running) {
|
||||
LOG_VERBOSE("ending start_loop", {});
|
||||
return;
|
||||
}
|
||||
condition_tasks.wait(lock, [&]{
|
||||
return !queue_tasks.empty();
|
||||
return (!queue_tasks.empty() || !running);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -111,13 +111,13 @@ static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down";
|
||||
static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
|
||||
|
||||
static void print_params(struct my_llama_hparams * params) {
|
||||
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
||||
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
||||
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
||||
printf("%s: n_head: %d\n", __func__, params->n_head);
|
||||
printf("%s: n_ff: %d\n", __func__, params->n_ff);
|
||||
printf("%s: n_layer: %d\n", __func__, params->n_layer);
|
||||
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
||||
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
|
||||
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
|
||||
printf("%s: n_embd: %u\n", __func__, params->n_embd);
|
||||
printf("%s: n_head: %u\n", __func__, params->n_head);
|
||||
printf("%s: n_ff: %u\n", __func__, params->n_ff);
|
||||
printf("%s: n_layer: %u\n", __func__, params->n_layer);
|
||||
printf("%s: n_rot: %u\n", __func__, params->n_rot);
|
||||
}
|
||||
|
||||
static void set_param_model(struct my_llama_model * model) {
|
||||
|
||||
+18
-4
@@ -4027,7 +4027,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
|
||||
y4 += 32 * 32;
|
||||
}
|
||||
#else
|
||||
// TODO
|
||||
(void) x;
|
||||
(void) y;
|
||||
(void) yl;
|
||||
(void) nb32;
|
||||
#endif
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
@@ -4170,7 +4173,10 @@ void kernel_mul_mv_iq2_xs_f32_impl(
|
||||
y4 += 32 * 32;
|
||||
}
|
||||
#else
|
||||
// TODO
|
||||
(void) x;
|
||||
(void) y;
|
||||
(void) yl;
|
||||
(void) nb32;
|
||||
#endif
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
@@ -4306,7 +4312,10 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
|
||||
y4 += 32 * 32;
|
||||
}
|
||||
#else
|
||||
// TODO
|
||||
(void) x;
|
||||
(void) y;
|
||||
(void) yl;
|
||||
(void) nb32;
|
||||
#endif
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
@@ -4424,7 +4433,10 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
||||
y4 += 16 * 32;
|
||||
}
|
||||
#else
|
||||
// TODO
|
||||
(void) x;
|
||||
(void) y;
|
||||
(void) yl;
|
||||
(void) nb32;
|
||||
#endif
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
@@ -4659,6 +4671,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
|
||||
const float dl = d * sc[0];
|
||||
const float ml = min * sc[1];
|
||||
#else
|
||||
(void) get_scale_min_k4_just2;
|
||||
|
||||
q = q + 16 * (il&1);
|
||||
device const uint8_t * s = xb->scales;
|
||||
device const half2 * dh = (device const half2 *)xb->d;
|
||||
|
||||
+41
-41
@@ -1837,9 +1837,9 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
||||
float sigma2 = sumx2/QK_K;
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
const float * restrict qw = quant_weights + QK_K * i + 16*j;
|
||||
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
|
||||
for (int l = 0; l < 16; ++l) sw[j] += weight[l];
|
||||
scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
||||
for (int l = 0; l < QK_K/16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
|
||||
for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
|
||||
scales[j] = make_qkx3_quants(QK_K/16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
||||
}
|
||||
|
||||
float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
|
||||
@@ -3855,7 +3855,7 @@ static inline __m128i get_scale_shuffle(int i) {
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
@@ -3866,8 +3866,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
assert(nrc == 1);
|
||||
#endif
|
||||
UNUSED(nrc);
|
||||
UNUSED(bbx);
|
||||
UNUSED(bby);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_q4_0 * restrict x = vx;
|
||||
@@ -4024,15 +4024,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
|
||||
const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);
|
||||
|
||||
__m128i bx = _mm_and_si128(lowMask, tmp);
|
||||
__m128i by = _mm_loadu_si128((const __m128i *)y[i].qs);
|
||||
bx = _mm_sub_epi8(bx, off);
|
||||
const __m128i i32_0 = mul_sum_i8_pairs(bx, by);
|
||||
__m128i bx_0 = _mm_and_si128(lowMask, tmp);
|
||||
__m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
|
||||
bx_0 = _mm_sub_epi8(bx_0, off);
|
||||
const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
|
||||
|
||||
bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
|
||||
by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
|
||||
bx = _mm_sub_epi8(bx, off);
|
||||
const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
|
||||
bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
|
||||
by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
|
||||
bx_0 = _mm_sub_epi8(bx_0, off);
|
||||
const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);
|
||||
|
||||
// Convert int32_t to float
|
||||
__m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
|
||||
@@ -4222,7 +4222,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
@@ -4233,8 +4233,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
assert(nrc == 1);
|
||||
#endif
|
||||
UNUSED(nrc);
|
||||
UNUSED(bbx);
|
||||
UNUSED(bby);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_q4_1 * restrict x = vx;
|
||||
@@ -4440,7 +4440,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
@@ -4448,8 +4448,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
assert(qk == QK5_0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bbx);
|
||||
UNUSED(bby);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_q5_0 * restrict x = vx;
|
||||
@@ -4618,21 +4618,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
/* Compute combined scale for the block */
|
||||
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
||||
|
||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
||||
__m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
|
||||
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
||||
__m128i bxhil = _mm256_castsi256_si128(bxhi);
|
||||
__m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
|
||||
bxhil = _mm_andnot_si128(bxhil, mask);
|
||||
bxhih = _mm_andnot_si128(bxhih, mask);
|
||||
__m128i bxl = _mm256_castsi256_si128(bx);
|
||||
__m128i bxh = _mm256_extractf128_si256(bx, 1);
|
||||
__m128i bxl = _mm256_castsi256_si128(bx_0);
|
||||
__m128i bxh = _mm256_extractf128_si256(bx_0, 1);
|
||||
bxl = _mm_or_si128(bxl, bxhil);
|
||||
bxh = _mm_or_si128(bxh, bxhih);
|
||||
bx = MM256_SET_M128I(bxh, bxl);
|
||||
bx_0 = MM256_SET_M128I(bxh, bxl);
|
||||
|
||||
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||
|
||||
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
||||
const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);
|
||||
|
||||
/* Multiply q with scale and accumulate */
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
|
||||
@@ -4731,7 +4731,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
@@ -4739,8 +4739,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
assert(qk == QK5_1);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bbx);
|
||||
UNUSED(bby);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_q5_1 * restrict x = vx;
|
||||
@@ -4925,22 +4925,22 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
|
||||
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
||||
|
||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
||||
__m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
|
||||
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
||||
__m128i bxhil = _mm256_castsi256_si128(bxhi);
|
||||
__m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
|
||||
bxhil = _mm_and_si128(bxhil, mask);
|
||||
bxhih = _mm_and_si128(bxhih, mask);
|
||||
__m128i bxl = _mm256_castsi256_si128(bx);
|
||||
__m128i bxh = _mm256_extractf128_si256(bx, 1);
|
||||
__m128i bxl = _mm256_castsi256_si128(bx_0);
|
||||
__m128i bxh = _mm256_extractf128_si256(bx_0, 1);
|
||||
bxl = _mm_or_si128(bxl, bxhil);
|
||||
bxh = _mm_or_si128(bxh, bxhih);
|
||||
bx = MM256_SET_M128I(bxh, bxl);
|
||||
bx_0 = MM256_SET_M128I(bxh, bxl);
|
||||
|
||||
const __m256 dy = _mm256_set1_ps(y[i].d);
|
||||
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||
|
||||
const __m256 q = mul_sum_us8_pairs_float(bx, by);
|
||||
const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
|
||||
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
|
||||
}
|
||||
@@ -5035,7 +5035,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
@@ -5046,8 +5046,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
assert(nrc == 1);
|
||||
#endif
|
||||
UNUSED(nrc);
|
||||
UNUSED(bbx);
|
||||
UNUSED(bby);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_q8_0 * restrict x = vx;
|
||||
@@ -5169,10 +5169,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// load elements
|
||||
vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
|
||||
vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
|
||||
vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[i].qs, vl);
|
||||
vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
|
||||
|
||||
vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
|
||||
vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);
|
||||
|
||||
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
|
||||
vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
|
||||
|
||||
@@ -17909,7 +17909,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
||||
|
||||
ptr += ggml_nbytes(tensor);
|
||||
|
||||
fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
|
||||
fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18012,7 +18012,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
|
||||
|
||||
result->nodes[i] = tensor;
|
||||
|
||||
fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
|
||||
fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
|
||||
wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
|
||||
echo "Usage:"
|
||||
echo ""
|
||||
|
||||
@@ -38,8 +38,8 @@ term ::= [0-9]+)""";
|
||||
// pretty print error message before asserting
|
||||
if (expected_pair.first != key || expected_pair.second != value)
|
||||
{
|
||||
fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second);
|
||||
fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value);
|
||||
fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second);
|
||||
fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value);
|
||||
fprintf(stderr, "expected_pair != actual_pair\n");
|
||||
}
|
||||
|
||||
@@ -96,9 +96,9 @@ term ::= [0-9]+)""";
|
||||
// pretty print error message before asserting
|
||||
if (expected_element.type != element.type || expected_element.value != element.value)
|
||||
{
|
||||
fprintf(stderr, "index: %d\n", index);
|
||||
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
|
||||
fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value);
|
||||
fprintf(stderr, "index: %u\n", index);
|
||||
fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
|
||||
fprintf(stderr, "actual_element: %d, %u\n", element.type, element.value);
|
||||
fprintf(stderr, "expected_element != actual_element\n");
|
||||
}
|
||||
|
||||
@@ -144,8 +144,8 @@ term ::= [0-9]+)""";
|
||||
// pretty print error message before asserting
|
||||
if (expected_pair.first != key || expected_pair.second != value)
|
||||
{
|
||||
fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second);
|
||||
fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value);
|
||||
fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second);
|
||||
fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value);
|
||||
fprintf(stderr, "expected_pair != actual_pair\n");
|
||||
}
|
||||
|
||||
@@ -235,9 +235,9 @@ term ::= [0-9]+)""";
|
||||
// pretty print error message before asserting
|
||||
if (expected_element.type != element.type || expected_element.value != element.value)
|
||||
{
|
||||
fprintf(stderr, "index: %d\n", index);
|
||||
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
|
||||
fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value);
|
||||
fprintf(stderr, "index: %u\n", index);
|
||||
fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
|
||||
fprintf(stderr, "actual_element: %d, %u\n", element.type, element.value);
|
||||
fprintf(stderr, "expected_element != actual_element\n");
|
||||
}
|
||||
|
||||
|
||||
@@ -180,8 +180,8 @@ int main()
|
||||
if (expected_element.type != element->type || expected_element.value != element->value)
|
||||
{
|
||||
fprintf(stderr, "index: %d\n", index);
|
||||
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
|
||||
fprintf(stderr, "actual_element: %d, %d\n", element->type, element->value);
|
||||
fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
|
||||
fprintf(stderr, "actual_element: %d, %u\n", element->type, element->value);
|
||||
fprintf(stderr, "expected_element != actual_element\n");
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user