Compare commits

...

10 Commits

Author SHA1 Message Date
Georgi Gerganov 14278f55d2 ggml : restore vec dot stride arg names (#5453) 2024-02-18 22:58:57 +02:00
Georgi Gerganov b1de96824b ci : fix wikitext url + compile warnings (#5569)
ggml-ci
2024-02-18 22:39:30 +02:00
Georgi Gerganov 7ad554f90e metal : fix unused warnings (#0) 2024-02-18 21:39:58 +02:00
Robey Holderith 5ee99c32f5 common, server : surface min_keep as its own parameter (#5567)
* Feature - surface min_keep as its own parameter

* Updated README with min_keep param
2024-02-18 21:11:16 +02:00
Pierrick Hymbert c145f8a132 server : slots monitoring endpoint (#5550) 2024-02-18 19:39:57 +02:00
Georgi Gerganov 689a091bbe sampling : do not set min_keep to n_probs (#5564) 2024-02-18 19:38:06 +02:00
Georgi Gerganov f3f28c5395 cmake : fix GGML_USE_SYCL typo (#5555) 2024-02-18 19:17:00 +02:00
Pierrick Hymbert e75c6279d1 server : enhanced health endpoint (#5548)
* server: enrich health endpoint with available slots, return 503 if not slots are available

* server: document new status no slot available in the README.md
2024-02-18 18:31:28 +02:00
Pierrick Hymbert 36376abe05 server : --n-predict option document and cap to max value (#5549)
* server: document --n-predict

* server: ensure client request cannot override n_predict if set

* server: fix print usage LF in new --n-predict option
2024-02-18 18:30:09 +02:00
Daniel Hiltgen 66c1968f7a server : graceful server shutdown (#5244)
This updates the server queue to support graceful shutdown of the server on signals.
2024-02-18 18:23:16 +02:00
14 changed files with 258 additions and 61 deletions
+1 -1
View File
@@ -526,7 +526,7 @@ if (LLAMA_SYCL)
message(STATUS "SYCL found")
add_compile_definitions(GML_USE_SYCL)
add_compile_definitions(GGML_USE_SYCL)
if (LLAMA_SYCL_F16)
add_compile_definitions(GGML_SYCL_F16)
+1 -1
View File
@@ -768,7 +768,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread
#### How to run
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
+2 -2
View File
@@ -219,7 +219,7 @@ function gg_run_open_llama_3b_v2 {
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
@@ -401,7 +401,7 @@ function gg_run_open_llama_7b_v2 {
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/open-llama/7B-v2"
+1
View File
@@ -1704,6 +1704,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
}
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
+2 -2
View File
@@ -121,7 +121,7 @@ static void sampler_queue(
struct llama_context * ctx_main,
const llama_sampling_params & params,
llama_token_data_array & cur_p,
size_t & min_keep) {
size_t min_keep) {
const float temp = params.temp;
const float dynatemp_range = params.dynatemp_range;
const float dynatemp_exponent = params.dynatemp_exponent;
@@ -249,7 +249,7 @@ static llama_token llama_sampling_sample_impl(
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
} else {
// temperature sampling
size_t min_keep = std::max(1, params.n_probs);
size_t min_keep = std::max(1, params.min_keep);
sampler_queue(ctx_main, params, cur_p, min_keep);
+1
View File
@@ -22,6 +22,7 @@ enum class llama_sampler_type : char {
typedef struct llama_sampling_params {
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
+2 -2
View File
@@ -309,7 +309,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
}
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
@@ -447,7 +447,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
return perplexity_v2(ctx, params);
}
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
+68
View File
@@ -39,6 +39,8 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
## Build
@@ -135,6 +137,7 @@ node index.js
- `{"status": "loading model"}` if the model is still being loaded.
- `{"status": "error"}` if the model failed to load.
- `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
- `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
@@ -196,6 +199,8 @@ node index.js
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0)
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
@@ -379,6 +384,69 @@ Notice that each `probs` is an array of length `n_probs`.
}'
```
- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
### Result JSON
```json
[
{
"dynatemp_exponent": 1.0,
"dynatemp_range": 0.0,
"frequency_penalty": 0.0,
"grammar": "",
"id": 0,
"ignore_eos": false,
"logit_bias": [],
"min_p": 0.05000000074505806,
"mirostat": 0,
"mirostat_eta": 0.10000000149011612,
"mirostat_tau": 5.0,
"model": "llama-2-7b-32k-instruct.Q2_K.gguf",
"n_ctx": 2048,
"n_keep": 0,
"n_predict": 100000,
"n_probs": 0,
"next_token": {
"has_next_token": true,
"n_remain": -1,
"num_tokens_predicted": 0,
"stopped_eos": false,
"stopped_limit": false,
"stopped_word": false,
"stopping_word": ""
},
"penalize_nl": true,
"penalty_prompt_tokens": [],
"presence_penalty": 0.0,
"prompt": "Say hello to llama.cpp",
"repeat_last_n": 64,
"repeat_penalty": 1.100000023841858,
"samplers": [
"top_k",
"tfs_z",
"typical_p",
"top_p",
"min_p",
"temperature"
],
"seed": 42,
"state": 1,
"stop": [
"\n"
],
"stream": false,
"task_id": 0,
"temperature": 0.0,
"tfs_z": 1.0,
"top_k": 40,
"top_p": 0.949999988079071,
"typical_p": 1.0,
"use_penalty_prompt_tokens": false
}
]
```
## More examples
### Change system prompt on runtime
+4
View File
@@ -234,6 +234,7 @@
mirostat_eta: 0.1, // learning rate
grammar: '',
n_probs: 0, // no completion_probabilities,
min_keep: 0, // min probs from each sampler,
image_data: [],
cache_prompt: true,
api_key: ''
@@ -791,6 +792,9 @@
<fieldset>
${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
</fieldset>
<fieldset>
${IntField({ label: "Min Probabilities from each Sampler", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
</fieldset>
<fieldset>
<label for="api_key">API Key</label>
<input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
+99 -4
View File
@@ -28,6 +28,7 @@
#include <chrono>
#include <condition_variable>
#include <atomic>
#include <signal.h>
using json = nlohmann::json;
@@ -40,6 +41,7 @@ struct server_params
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
bool slots_endpoint = true;
};
bool server_verbose = false;
@@ -158,6 +160,7 @@ struct llama_client_slot
int32_t n_decoded = 0;
int32_t n_remaining = -1;
int32_t i_batch = -1;
int32_t n_predict = -1;
int32_t num_prompt_tokens = 0;
int32_t num_prompt_tokens_processed = 0;
@@ -409,6 +412,7 @@ struct llama_server_context
slot.id = i;
slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict;
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
@@ -544,6 +548,16 @@ struct llama_server_context
slot->params.seed = json_value(data, "seed", default_params.seed);
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
// Might be better to reject the request with a 400 ?
LOG_WARNING("Max tokens to predict exceeds server configuration", {
{"params.n_predict", slot->params.n_predict},
{"slot.n_predict", slot->n_predict},
});
slot->params.n_predict = slot->n_predict;
}
// infill
if (data.count("input_prefix") != 0)
@@ -1052,6 +1066,7 @@ struct llama_server_context
return json {
{"n_ctx", slot.n_ctx},
{"n_predict", slot.n_predict},
{"model", params.model_alias},
{"seed", slot.params.seed},
{"temperature", slot.sparams.temp},
@@ -1079,6 +1094,7 @@ struct llama_server_context
{"stream", slot.params.stream},
{"logit_bias", slot.sparams.logit_bias},
{"n_probs", slot.sparams.n_probs},
{"min_keep", slot.sparams.min_keep},
{"grammar", slot.sparams.grammar},
{"samplers", samplers_sequence}
};
@@ -1913,14 +1929,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n");
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
printf("\n");
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
printf(" --chat-template FORMAT_NAME");
printf(" set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
printf(" set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
printf("\n");
}
@@ -2360,6 +2378,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
log_set_target(stdout);
LOG_INFO("logging to file is disabled.", {});
}
else if (arg == "--slots-endpoint-disable")
{
sparams.slots_endpoint = false;
}
else if (arg == "--chat-template")
{
if (++i >= argc)
@@ -2511,6 +2533,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
}
}
std::function<void(int)> shutdown_handler;
inline void signal_handler(int signal) { shutdown_handler(signal); }
int main(int argc, char **argv)
{
#if SERVER_VERBOSE != 1
@@ -2561,8 +2586,35 @@ int main(int argc, char **argv)
server_state current_state = state.load();
switch(current_state) {
case SERVER_STATE_READY:
res.set_content(R"({"status": "ok"})", "application/json");
res.status = 200; // HTTP OK
if (llama.all_slots_are_idle) {
res.set_content(R"({"status": "ok"})", "application/json");
res.status = 200; // HTTP OK
} else {
int available_slots = 0;
int processing_slots = 0;
for (llama_client_slot & slot : llama.slots) {
if (slot.available()) {
available_slots++;
} else {
processing_slots++;
}
}
if (available_slots > 0) {
json health = {
{"status", "ok"},
{"slots_idle", available_slots},
{"slots_processing", processing_slots}};
res.set_content(health.dump(), "application/json");
res.status = 200; // HTTP OK
} else {
json health = {
{"status", "no slot available"},
{"slots_idle", available_slots},
{"slots_processing", processing_slots}};
res.set_content(health.dump(), "application/json");
res.status = 503; // HTTP Service Unavailable
}
}
break;
case SERVER_STATE_LOADING_MODEL:
res.set_content(R"({"status": "loading model"})", "application/json");
@@ -2575,6 +2627,32 @@ int main(int argc, char **argv)
}
});
if (sparams.slots_endpoint) {
svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
json slots;
for (llama_client_slot & slot : llama.slots) {
json slot_data = llama.get_formated_generation(slot);
slot_data["id"] = slot.id;
slot_data["task_id"] = slot.task_id;
slot_data["state"] = slot.state;
slot_data["prompt"] = slot.prompt;
slot_data["next_token"] = {
{"has_next_token", slot.has_next_token},
{"n_remain", slot.n_remaining},
{"num_tokens_predicted", slot.n_decoded},
{"stopped_eos", slot.stopped_eos},
{"stopped_word", slot.stopped_word},
{"stopped_limit", slot.stopped_limit},
{"stopping_word", slot.stopping_word},
};
slots.push_back(slot_data);
}
res.set_content(slots.dump(), "application/json");
res.status = 200; // HTTP OK
});
}
svr.set_logger(log_server_request);
svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
@@ -3128,8 +3206,25 @@ int main(int argc, char **argv)
std::placeholders::_2,
std::placeholders::_3
));
llama.queue_tasks.start_loop();
shutdown_handler = [&](int) {
llama.queue_tasks.terminate();
};
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = signal_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
llama.queue_tasks.start_loop();
svr.stop();
t.join();
llama_backend_free();
+17 -3
View File
@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
bool running;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
@@ -278,9 +279,18 @@ struct llama_server_queue {
queue_tasks_deferred.clear();
}
// Start the main loop. This call is blocking
[[noreturn]]
// end the start_loop routine
void terminate() {
{
std::unique_lock<std::mutex> lock(mutex_tasks);
running = false;
}
condition_tasks.notify_all();
}
// Start the main loop.
void start_loop() {
running = true;
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
@@ -324,8 +334,12 @@ struct llama_server_queue {
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
if (!running) {
LOG_VERBOSE("ending start_loop", {});
return;
}
condition_tasks.wait(lock, [&]{
return !queue_tasks.empty();
return (!queue_tasks.empty() || !running);
});
}
}
+18 -4
View File
@@ -4027,7 +4027,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
y4 += 32 * 32;
}
#else
// TODO
(void) x;
(void) y;
(void) yl;
(void) nb32;
#endif
for (int row = 0; row < N_DST; ++row) {
@@ -4170,7 +4173,10 @@ void kernel_mul_mv_iq2_xs_f32_impl(
y4 += 32 * 32;
}
#else
// TODO
(void) x;
(void) y;
(void) yl;
(void) nb32;
#endif
for (int row = 0; row < N_DST; ++row) {
@@ -4306,7 +4312,10 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
y4 += 32 * 32;
}
#else
// TODO
(void) x;
(void) y;
(void) yl;
(void) nb32;
#endif
for (int row = 0; row < N_DST; ++row) {
@@ -4424,7 +4433,10 @@ void kernel_mul_mv_iq1_s_f32_impl(
y4 += 16 * 32;
}
#else
// TODO
(void) x;
(void) y;
(void) yl;
(void) nb32;
#endif
for (int row = 0; row < N_DST; ++row) {
@@ -4659,6 +4671,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
const float dl = d * sc[0];
const float ml = min * sc[1];
#else
(void) get_scale_min_k4_just2;
q = q + 16 * (il&1);
device const uint8_t * s = xb->scales;
device const half2 * dh = (device const half2 *)xb->d;
+41 -41
View File
@@ -1837,9 +1837,9 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
float sigma2 = sumx2/QK_K;
for (int j = 0; j < QK_K/16; ++j) {
const float * restrict qw = quant_weights + QK_K * i + 16*j;
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
for (int l = 0; l < 16; ++l) sw[j] += weight[l];
scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
for (int l = 0; l < QK_K/16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
scales[j] = make_qkx3_quants(QK_K/16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
}
float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
@@ -3855,7 +3855,7 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -3866,8 +3866,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
assert(nrc == 1);
#endif
UNUSED(nrc);
UNUSED(bbx);
UNUSED(bby);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q4_0 * restrict x = vx;
@@ -4024,15 +4024,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);
__m128i bx = _mm_and_si128(lowMask, tmp);
__m128i by = _mm_loadu_si128((const __m128i *)y[i].qs);
bx = _mm_sub_epi8(bx, off);
const __m128i i32_0 = mul_sum_i8_pairs(bx, by);
__m128i bx_0 = _mm_and_si128(lowMask, tmp);
__m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
bx_0 = _mm_sub_epi8(bx_0, off);
const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
bx = _mm_sub_epi8(bx, off);
const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
bx_0 = _mm_sub_epi8(bx_0, off);
const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);
// Convert int32_t to float
__m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
@@ -4222,7 +4222,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
#endif
}
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_1;
const int nb = n / qk;
@@ -4233,8 +4233,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
assert(nrc == 1);
#endif
UNUSED(nrc);
UNUSED(bbx);
UNUSED(bby);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q4_1 * restrict x = vx;
@@ -4440,7 +4440,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
#endif
}
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -4448,8 +4448,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
assert(qk == QK5_0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bbx);
UNUSED(bby);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q5_0 * restrict x = vx;
@@ -4618,21 +4618,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
/* Compute combined scale for the block */
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
__m256i bx = bytes_from_nibbles_32(x[i].qs);
__m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
__m128i bxhil = _mm256_castsi256_si128(bxhi);
__m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
bxhil = _mm_andnot_si128(bxhil, mask);
bxhih = _mm_andnot_si128(bxhih, mask);
__m128i bxl = _mm256_castsi256_si128(bx);
__m128i bxh = _mm256_extractf128_si256(bx, 1);
__m128i bxl = _mm256_castsi256_si128(bx_0);
__m128i bxh = _mm256_extractf128_si256(bx_0, 1);
bxl = _mm_or_si128(bxl, bxhil);
bxh = _mm_or_si128(bxh, bxhih);
bx = MM256_SET_M128I(bxh, bxl);
bx_0 = MM256_SET_M128I(bxh, bxl);
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_i8_pairs_float(bx, by);
const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);
/* Multiply q with scale and accumulate */
acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
@@ -4731,7 +4731,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_1;
const int nb = n / qk;
@@ -4739,8 +4739,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
assert(qk == QK5_1);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bbx);
UNUSED(bby);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q5_1 * restrict x = vx;
@@ -4925,22 +4925,22 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
__m256i bx = bytes_from_nibbles_32(x[i].qs);
__m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
__m128i bxhil = _mm256_castsi256_si128(bxhi);
__m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
bxhil = _mm_and_si128(bxhil, mask);
bxhih = _mm_and_si128(bxhih, mask);
__m128i bxl = _mm256_castsi256_si128(bx);
__m128i bxh = _mm256_extractf128_si256(bx, 1);
__m128i bxl = _mm256_castsi256_si128(bx_0);
__m128i bxh = _mm256_extractf128_si256(bx_0, 1);
bxl = _mm_or_si128(bxl, bxhil);
bxh = _mm_or_si128(bxh, bxhih);
bx = MM256_SET_M128I(bxh, bxl);
bx_0 = MM256_SET_M128I(bxh, bxl);
const __m256 dy = _mm256_set1_ps(y[i].d);
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_us8_pairs_float(bx, by);
const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
}
@@ -5035,7 +5035,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -5046,8 +5046,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
assert(nrc == 1);
#endif
UNUSED(nrc);
UNUSED(bbx);
UNUSED(bby);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q8_0 * restrict x = vx;
@@ -5169,10 +5169,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
for (int i = 0; i < nb; i++) {
// load elements
vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[i].qs, vl);
vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
+1 -1
View File
@@ -1,6 +1,6 @@
#!/bin/bash
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
echo "Usage:"
echo ""