mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-17 11:07:39 +02:00
Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4cd621c26d | |||
| 7e0b6a7b3b | |||
| acdce3cdef | |||
| 3855416027 | |||
| c0e6fbf8c3 | |||
| c780e75305 | |||
| 48b2f9c1fc | |||
| af0a5b6163 | |||
| b6aa670203 | |||
| 260b7c6529 | |||
| 53d6c52e22 |
@@ -20,7 +20,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
|
||||
### Hot topics
|
||||
|
||||
- **BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920**
|
||||
- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
|
||||
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
|
||||
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
|
||||
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
|
||||
@@ -935,17 +936,25 @@ If your issue is with model generation quality, then please at least scan the fo
|
||||
|
||||
### Android
|
||||
|
||||
#### Build on Android using Termux
|
||||
[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
|
||||
```
|
||||
apt update && apt upgrade -y
|
||||
apt install git make cmake
|
||||
```
|
||||
|
||||
It's recommended to move your model inside the `~/` directory for best performance:
|
||||
```
|
||||
cd storage/downloads
|
||||
mv model.gguf ~/
|
||||
```
|
||||
|
||||
[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
|
||||
|
||||
#### Building the Project using Android NDK
|
||||
You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
|
||||
|
||||
First, install the essential packages for termux:
|
||||
```
|
||||
pkg install clang wget git cmake
|
||||
```
|
||||
Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
|
||||
|
||||
You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux.
|
||||
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
|
||||
|
||||
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
|
||||
```
|
||||
$ mkdir build-android
|
||||
$ cd build-android
|
||||
@@ -953,7 +962,9 @@ $ export NDK=<your_ndk_directory>
|
||||
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
|
||||
$ make
|
||||
```
|
||||
Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
|
||||
|
||||
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
|
||||
|
||||
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
|
||||
|
||||
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
|
||||
@@ -975,25 +986,10 @@ $cd /data/data/com.termux/files/home/bin
|
||||
$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
|
||||
```
|
||||
|
||||
Here is a demo of an interactive session running on Pixel 5 phone:
|
||||
Here's a demo of an interactive session running on Pixel 5 phone:
|
||||
|
||||
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
|
||||
|
||||
#### Build on Android using Termux
|
||||
[Termux](https://github.com/termux/termux-app#installation) is an alternative to execute `llama.cpp` on an Android device (no root required).
|
||||
```
|
||||
apt update && apt upgrade -y
|
||||
apt install git
|
||||
```
|
||||
|
||||
It's recommended to move your model inside the `~/` directory for best performance:
|
||||
```
|
||||
cd storage/downloads
|
||||
mv model.gguf ~/
|
||||
```
|
||||
|
||||
[Follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
|
||||
|
||||
### Docker
|
||||
|
||||
#### Prerequisites
|
||||
|
||||
@@ -35,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
|
||||
|
||||
result->prev.resize(params.n_prev);
|
||||
|
||||
result->n_considered = 0;
|
||||
|
||||
llama_sampling_set_rng_seed(result, params.seed);
|
||||
|
||||
return result;
|
||||
@@ -64,6 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
|
||||
|
||||
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
|
||||
ctx->cur.clear();
|
||||
ctx->n_considered = 0;
|
||||
}
|
||||
|
||||
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
|
||||
@@ -253,6 +256,8 @@ static llama_token llama_sampling_sample_impl(
|
||||
}
|
||||
}
|
||||
|
||||
ctx_sampling->n_considered = cur_p.size;
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
|
||||
@@ -81,6 +81,7 @@ struct llama_sampling_context {
|
||||
// TODO: replace with ring-buffer
|
||||
std::vector<llama_token> prev;
|
||||
std::vector<llama_token_data> cur;
|
||||
size_t n_considered;
|
||||
|
||||
std::mt19937 rng;
|
||||
};
|
||||
|
||||
@@ -67,6 +67,8 @@ models = [
|
||||
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
||||
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
|
||||
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
|
||||
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
|
||||
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
|
||||
]
|
||||
|
||||
# make directory "models/tokenizers" if it doesn't exist
|
||||
@@ -150,6 +152,8 @@ for model in models:
|
||||
# print the "pre_tokenizer" content from the tokenizer.json
|
||||
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
cfg = json.load(f)
|
||||
normalizer = cfg["normalizer"]
|
||||
logger.info("normalizer: " + json.dumps(normalizer, indent=4))
|
||||
pre_tokenizer = cfg["pre_tokenizer"]
|
||||
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
|
||||
|
||||
|
||||
@@ -314,6 +314,12 @@ class Model(ABC):
|
||||
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
|
||||
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
|
||||
res = "command-r"
|
||||
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
|
||||
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
|
||||
res = "olmo"
|
||||
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
|
||||
# ref: https://huggingface.co/databricks/dbrx-instruct
|
||||
res = "dbrx"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2831,8 +2837,9 @@ class OlmoModel(Model):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_layer_norm_eps(1e-5)
|
||||
if "clip_qkv" in self.hparams is not None:
|
||||
self.gguf_writer.add_clamp_kqv(self.hparams["clip_qkv"])
|
||||
clip_qkv = self.hparams.get("clip_qkv")
|
||||
if clip_qkv is not None:
|
||||
self.gguf_writer.add_clamp_kqv(clip_qkv)
|
||||
|
||||
# Same as super class, but permuting q_proj, k_proj
|
||||
# Copied from: LlamaModel
|
||||
|
||||
@@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||
GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
|
||||
|
||||
auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
|
||||
if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
|
||||
if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
|
||||
return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
|
||||
} else if (a->type == GGML_TYPE_F32) {
|
||||
return ggml_add(ctx, a, b);
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
|
||||
struct Stats {
|
||||
std::vector<float> values;
|
||||
std::vector<int> counts;
|
||||
int ncall = 0;
|
||||
};
|
||||
|
||||
@@ -121,12 +122,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
auto & e = m_stats[wname];
|
||||
|
||||
++e.ncall;
|
||||
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
|
||||
// using the following line, we can correct for that if needed by replacing the line above with:
|
||||
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
|
||||
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(src1->ne[0]*n_as, 0);
|
||||
e.counts.resize(src1->ne[0]*n_as, 0);
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
|
||||
@@ -153,6 +152,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
|
||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||
e.values[e_start + j] += x[j]*x[j];
|
||||
e.counts[e_start + j]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -170,6 +170,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
auto& e = m_stats[wname];
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(src1->ne[0], 0);
|
||||
e.counts.resize(src1->ne[0], 0);
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
|
||||
@@ -183,6 +184,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
const float * x = data + row * src1->ne[0];
|
||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||
e.values[j] += x[j]*x[j];
|
||||
e.counts[j]++;
|
||||
}
|
||||
}
|
||||
if (e.ncall > m_last_call) {
|
||||
@@ -222,7 +224,13 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
|
||||
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
|
||||
int nval = p.second.values.size();
|
||||
out.write((const char *) &nval, sizeof(nval));
|
||||
if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
|
||||
if (nval > 0) {
|
||||
std::vector<float> tmp(nval);
|
||||
for (int i = 0; i < nval; i++) {
|
||||
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
|
||||
}
|
||||
out.write((const char*)tmp.data(), nval*sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
// Write the number of call the matrix was computed with
|
||||
@@ -270,14 +278,28 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
|
||||
imatrix_data = {};
|
||||
return false;
|
||||
}
|
||||
e.values.resize(nval);
|
||||
in.read((char*)e.values.data(), nval*sizeof(float));
|
||||
|
||||
// When re-called from load_imatrix() with add set, this will already be created.
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(nval, 0);
|
||||
e.counts.resize(nval, 0);
|
||||
}
|
||||
|
||||
std::vector<float> tmp(nval);
|
||||
in.read((char*)tmp.data(), nval*sizeof(float));
|
||||
if (in.fail()) {
|
||||
printf("%s: failed reading data for entry %d\n",__func__,i);
|
||||
imatrix_data = {};
|
||||
return false;
|
||||
}
|
||||
e.ncall = ncall;
|
||||
|
||||
// Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
|
||||
for (int i = 0; i < nval; i++) {
|
||||
e.values[i] += tmp[i];
|
||||
e.counts[i] += ncall;
|
||||
}
|
||||
e.ncall += ncall;
|
||||
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -46,7 +46,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", },
|
||||
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
|
||||
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
||||
|
||||
@@ -62,6 +62,18 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
|
||||
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
||||
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
|
||||
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
|
||||
- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn`
|
||||
- `--rope-freq-base N` : RoPE frequency base (default: loaded from model)
|
||||
- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25)
|
||||
- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation)
|
||||
- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
|
||||
- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0)
|
||||
- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0)
|
||||
- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls`
|
||||
- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled)
|
||||
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
|
||||
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
|
||||
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
|
||||
|
||||
**If compiled with `LLAMA_SERVER_SSL=ON`**
|
||||
- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
|
||||
@@ -260,7 +272,7 @@ node index.js
|
||||
|
||||
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
|
||||
|
||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token. Default: `0`
|
||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
|
||||
|
||||
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
|
||||
|
||||
|
||||
+24
-10
@@ -2266,17 +2266,31 @@ struct server_context {
|
||||
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
|
||||
result.tok = id;
|
||||
|
||||
const int32_t n_probs = slot.sparams.n_probs;
|
||||
if (slot.sparams.temp <= 0 && n_probs > 0) {
|
||||
// for llama_sample_token_greedy we need to sort candidates
|
||||
llama_sample_softmax(ctx, &cur_p);
|
||||
}
|
||||
const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
|
||||
if (n_probs > 0) {
|
||||
const size_t n_considered = slot.ctx_sampling->n_considered;
|
||||
|
||||
for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p.data[i].id,
|
||||
cur_p.data[i].p
|
||||
});
|
||||
// Make sure at least n_probs top tokens are at the front of the vector:
|
||||
if (slot.sparams.temp == 0.0f && n_probs > n_considered) {
|
||||
llama_sample_top_k(ctx, &cur_p, n_probs, 0);
|
||||
}
|
||||
|
||||
if (slot.sparams.temp == 0.0f) {
|
||||
// With greedy sampling the probabilities have possibly not been calculated.
|
||||
for (size_t i = 0; i < n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p.data[i].id,
|
||||
i == 0 ? 1.0f : 0.0f
|
||||
});
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p.data[i].id,
|
||||
i >= n_considered ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!process_token(result, slot)) {
|
||||
|
||||
+77
@@ -17,6 +17,83 @@
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
/**
|
||||
* Converts brain16 to float32.
|
||||
*
|
||||
* The bfloat16 floating point format has the following structure:
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───┐
|
||||
* 0b0000000000000000 brain16
|
||||
*
|
||||
* Since bf16 has the same number of exponent bits as a 32bit float,
|
||||
* encoding and decoding numbers becomes relatively straightforward.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───────────────────┐
|
||||
* 0b00000000000000000000000000000000 IEEE binary32
|
||||
*
|
||||
* For comparison, the standard fp16 format has fewer exponent bits.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌─┴─┐┌─┴──────┐
|
||||
* 0b0000000000000000 IEEE binary16
|
||||
*
|
||||
* @see IEEE 754-2008
|
||||
*/
|
||||
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.i = (uint32_t)h.bits << 16;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts float32 to brain16.
|
||||
*
|
||||
* This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
|
||||
* Subnormals shall be flushed to zero, and NANs will be quiet.
|
||||
* This code should vectorize nicely if using modern compilers.
|
||||
*/
|
||||
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||
ggml_bf16_t h;
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.f = s;
|
||||
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
||||
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
||||
return h;
|
||||
}
|
||||
if (!(u.i & 0x7f800000)) { /* subnormal */
|
||||
h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
|
||||
return h;
|
||||
}
|
||||
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
||||
return h;
|
||||
}
|
||||
|
||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
+1
-1
@@ -803,7 +803,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
return op->ne[3] == 1;
|
||||
return op->src[0]->type != GGML_TYPE_BF16 && op->ne[3] == 1;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
|
||||
+1
-1
@@ -2175,7 +2175,7 @@ kernel void kernel_flash_attn_ext_f16(
|
||||
|
||||
const short D4 = D/4;
|
||||
const short D8 = D/8;
|
||||
const short Q8 = Q/8;
|
||||
//const short Q8 = Q/8;
|
||||
const short NW = N_SIMDWIDTH;
|
||||
const short SH = (C + Q); // shared memory per simdgroup in (half)
|
||||
|
||||
|
||||
@@ -12450,6 +12450,24 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
|
||||
const size_t nb = nbytes/ggml_type_size(type);
|
||||
|
||||
switch (type) {
|
||||
case GGML_TYPE_BF16:
|
||||
{
|
||||
int nans = 0;
|
||||
int infs = 0;
|
||||
const unsigned short * f = (const unsigned short *) data;
|
||||
for (size_t i = 0; i < nb; ++i) {
|
||||
nans += (f[i] & 0x7fff) > 0x7f80;
|
||||
infs += (f[i] & 0x7fff) == 0x7f80;
|
||||
}
|
||||
if (nans) {
|
||||
fprintf(stderr, "%s: found %d NaNs in row of %zu BF16 values\n", __func__, nans, nb);
|
||||
return false;
|
||||
}
|
||||
if (infs) {
|
||||
fprintf(stderr, "%s: found %d infinities in row of %zu BF16 values\n", __func__, infs, nb);
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
const ggml_fp16_t * f = (const ggml_fp16_t *) data;
|
||||
|
||||
@@ -326,14 +326,20 @@ extern "C" {
|
||||
// get ggml_status name string
|
||||
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
|
||||
|
||||
// ieee 754-2008 half-precision float16
|
||||
// todo: make this not an integral type
|
||||
typedef uint16_t ggml_fp16_t;
|
||||
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t);
|
||||
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
|
||||
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
|
||||
GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
|
||||
|
||||
// convert FP16 <-> FP32
|
||||
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
||||
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
||||
|
||||
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
|
||||
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
|
||||
// google brain half-precision bfloat16
|
||||
typedef struct { uint16_t bits; } ggml_bf16_t;
|
||||
GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
|
||||
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
|
||||
GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
|
||||
GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
|
||||
|
||||
struct ggml_object;
|
||||
struct ggml_context;
|
||||
@@ -370,6 +376,7 @@ extern "C" {
|
||||
GGML_TYPE_I64 = 27,
|
||||
GGML_TYPE_F64 = 28,
|
||||
GGML_TYPE_IQ1_M = 29,
|
||||
GGML_TYPE_BF16 = 30,
|
||||
GGML_TYPE_COUNT,
|
||||
};
|
||||
|
||||
@@ -410,6 +417,7 @@ extern "C" {
|
||||
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
|
||||
@@ -817,6 +817,7 @@ class GGMLQuantizationType(IntEnum):
|
||||
I64 = 27
|
||||
F64 = 28
|
||||
IQ1_M = 29
|
||||
BF16 = 30
|
||||
|
||||
|
||||
class GGUFEndian(IntEnum):
|
||||
@@ -888,6 +889,7 @@ GGML_QUANT_SIZES = {
|
||||
GGMLQuantizationType.I64: (1, 8),
|
||||
GGMLQuantizationType.F64: (1, 8),
|
||||
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
|
||||
GGMLQuantizationType.BF16: (1, 2),
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -3175,6 +3175,7 @@ struct llama_model_loader {
|
||||
switch (type_max) {
|
||||
case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
|
||||
case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
|
||||
case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
|
||||
case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
|
||||
case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
|
||||
case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
|
||||
@@ -3666,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||
switch (ftype) {
|
||||
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
||||
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
||||
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||
@@ -4389,6 +4391,12 @@ static void llm_load_vocab(
|
||||
} else if (
|
||||
tokenizer_pre == "command-r") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
|
||||
} else if (
|
||||
tokenizer_pre == "olmo") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
|
||||
} else if (
|
||||
tokenizer_pre == "dbrx") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
}
|
||||
@@ -6126,6 +6134,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
|| !(
|
||||
model.ftype == LLAMA_FTYPE_ALL_F32 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
|
||||
)
|
||||
@@ -12194,6 +12203,7 @@ struct llm_tokenizer_bpe {
|
||||
case LLAMA_VOCAB_TYPE_BPE:
|
||||
switch (vocab.type_pre) {
|
||||
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
||||
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
// original regex from tokenizer.json
|
||||
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
@@ -12248,6 +12258,7 @@ struct llm_tokenizer_bpe {
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_GPT2:
|
||||
case LLAMA_VOCAB_PRE_TYPE_OLMO:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
});
|
||||
@@ -14154,13 +14165,16 @@ static void llama_tensor_dequantize_internal(
|
||||
if (qtype.to_float == NULL) {
|
||||
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
|
||||
}
|
||||
} else if (tensor->type != GGML_TYPE_F16) {
|
||||
} else if (tensor->type != GGML_TYPE_F16 &&
|
||||
tensor->type != GGML_TYPE_BF16) {
|
||||
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
|
||||
}
|
||||
|
||||
if (nthread < 2) {
|
||||
if (tensor->type == GGML_TYPE_F16) {
|
||||
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
|
||||
} else if (tensor->type == GGML_TYPE_BF16) {
|
||||
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
|
||||
} else if (ggml_is_quantized(tensor->type)) {
|
||||
qtype.to_float(tensor->data, f32_output, nelements);
|
||||
} else {
|
||||
@@ -14169,7 +14183,14 @@ static void llama_tensor_dequantize_internal(
|
||||
return;
|
||||
}
|
||||
|
||||
size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
|
||||
size_t block_size;
|
||||
if (tensor->type == GGML_TYPE_F16 ||
|
||||
tensor->type == GGML_TYPE_BF16) {
|
||||
block_size = 1;
|
||||
} else {
|
||||
block_size = (size_t)ggml_blck_size(tensor->type);
|
||||
}
|
||||
|
||||
size_t block_size_bytes = ggml_type_size(tensor->type);
|
||||
|
||||
GGML_ASSERT(nelements % block_size == 0);
|
||||
@@ -14188,6 +14209,8 @@ static void llama_tensor_dequantize_internal(
|
||||
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
|
||||
if (typ == GGML_TYPE_F16) {
|
||||
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
||||
} else if (typ == GGML_TYPE_BF16) {
|
||||
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
|
||||
} else {
|
||||
qtype.to_float(inbuf, outbuf, nels);
|
||||
}
|
||||
@@ -14548,6 +14571,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
|
||||
case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
|
||||
case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
|
||||
case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
|
||||
|
||||
// K-quants
|
||||
|
||||
@@ -81,6 +81,8 @@ extern "C" {
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
||||
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
||||
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 10,
|
||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 11,
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
@@ -136,6 +138,7 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
||||
@@ -93,11 +93,14 @@ help_s = (
|
||||
"specified values are averaged WITHOUT weighing by the --repetitions parameter of llama-bench."
|
||||
)
|
||||
parser.add_argument("-s", "--show", help=help_s)
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
|
||||
known_args, unknown_args = parser.parse_known_args()
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
|
||||
|
||||
if unknown_args:
|
||||
logger.error(f"Received unknown args: {unknown_args}.")
|
||||
logger.error(f"Received unknown args: {unknown_args}.\n")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
@@ -110,7 +113,7 @@ if input_file is None:
|
||||
input_file = sqlite_files[0]
|
||||
|
||||
if input_file is None:
|
||||
logger.error("Cannot find a suitable input file, please provide one.")
|
||||
logger.error("Cannot find a suitable input file, please provide one.\n")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
@@ -202,12 +205,12 @@ elif repo is not None:
|
||||
hexsha8_baseline = find_parent_in_data(repo.heads.master.commit)
|
||||
|
||||
if hexsha8_baseline is None:
|
||||
logger.error("No baseline was provided and did not find data for any master branch commits.")
|
||||
logger.error("No baseline was provided and did not find data for any master branch commits.\n")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
else:
|
||||
logger.error("No baseline was provided and the current working directory "
|
||||
"is not part of a git repository from which a baseline could be inferred.")
|
||||
"is not part of a git repository from which a baseline could be inferred.\n")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
@@ -238,7 +241,7 @@ elif repo is not None:
|
||||
break
|
||||
|
||||
if hexsha8_compare is None:
|
||||
logger.error("No compare target was provided and did not find data for any non-master commits.")
|
||||
logger.error("No compare target was provided and did not find data for any non-master commits.\n")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
else:
|
||||
@@ -361,7 +364,7 @@ if "gpu_info" in show:
|
||||
headers = [PRETTY_NAMES[p] for p in show]
|
||||
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
|
||||
|
||||
logger.info(tabulate(
|
||||
print(tabulate( # noqa: NP100
|
||||
table,
|
||||
headers=headers,
|
||||
floatfmt=".2f",
|
||||
|
||||
@@ -50,7 +50,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
||||
|
||||
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
|
||||
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
|
||||
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
|
||||
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
|
||||
GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
|
||||
std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
|
||||
std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
|
||||
@@ -92,6 +92,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
||||
size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
|
||||
if (t->type == GGML_TYPE_F16) {
|
||||
tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
|
||||
} else if (t->type == GGML_TYPE_BF16) {
|
||||
tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
|
||||
} else if (t->type == GGML_TYPE_F32) {
|
||||
tv.push_back(*(float *) &buf[i]);
|
||||
} else if (t->type == GGML_TYPE_I32) {
|
||||
@@ -1898,7 +1900,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
std::default_random_engine rng(0);
|
||||
|
||||
const ggml_type all_types[] = {
|
||||
GGML_TYPE_F32, GGML_TYPE_F16,
|
||||
GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
|
||||
GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
|
||||
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
|
||||
GGML_TYPE_Q8_0,
|
||||
|
||||
Reference in New Issue
Block a user