mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-16 18:47:39 +02:00
Compare commits
22 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4cd621c26d | |||
| 7e0b6a7b3b | |||
| acdce3cdef | |||
| 3855416027 | |||
| c0e6fbf8c3 | |||
| c780e75305 | |||
| 48b2f9c1fc | |||
| af0a5b6163 | |||
| b6aa670203 | |||
| 260b7c6529 | |||
| 53d6c52e22 | |||
| 3af34c1d1b | |||
| 04976db7a8 | |||
| 947d3ad27d | |||
| 858f6b73f6 | |||
| b3a995b416 | |||
| bcdee0daa7 | |||
| 628b299106 | |||
| 8f8acc8683 | |||
| ca36326020 | |||
| 889bdd7686 | |||
| 6fbd432211 |
@@ -1,4 +1,17 @@
|
||||
[flake8]
|
||||
max-line-length = 125
|
||||
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
|
||||
exclude = examples/*,examples/*/**,*/**/__init__.py,scripts/gen-unicode-data.py,tests/test-tokenizer-0.py
|
||||
exclude =
|
||||
# Do not traverse examples
|
||||
examples,
|
||||
# Do not include package initializers
|
||||
__init__.py,
|
||||
# No need to traverse our git directory
|
||||
.git,
|
||||
# There's no value in checking cache directories
|
||||
__pycache__,
|
||||
# No need to include the build path
|
||||
build,
|
||||
# This contains builds that we don't want to check
|
||||
dist # This is generated with `python build .` for package releases
|
||||
# max-complexity = 10
|
||||
|
||||
@@ -52,7 +52,19 @@ jobs:
|
||||
ftype: q4_0
|
||||
pr_comment_enabled: "true"
|
||||
|
||||
if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
|
||||
if: |
|
||||
inputs.gpu-series == 'Standard_NC4as_T4_v3'
|
||||
|| (
|
||||
github.event_name == 'schedule'
|
||||
&& github.ref_name == 'master'
|
||||
&& github.repository_owner == 'ggerganov'
|
||||
)
|
||||
|| github.event_name == 'pull_request_target'
|
||||
|| (
|
||||
github.event_name == 'push'
|
||||
&& github.event.ref == 'refs/heads/master'
|
||||
&& github.repository_owner == 'ggerganov'
|
||||
)
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
|
||||
+10
-1
@@ -103,6 +103,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
|
||||
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||
"llama: max. batch size for using peer access")
|
||||
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
|
||||
option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
|
||||
|
||||
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
|
||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
|
||||
@@ -409,6 +411,9 @@ if (LLAMA_CUDA)
|
||||
if (LLAMA_CUDA_FORCE_MMQ)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
if (LLAMA_CUDA_NO_VMM)
|
||||
add_compile_definitions(GGML_CUDA_NO_VMM)
|
||||
endif()
|
||||
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||
if (DEFINED LLAMA_CUDA_DMMV_Y)
|
||||
@@ -434,7 +439,11 @@ if (LLAMA_CUDA)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
endif()
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
|
||||
if (LLAMA_CUDA_NO_VMM)
|
||||
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
|
||||
else()
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
# 52 == lowest CUDA 12 standard
|
||||
|
||||
@@ -20,7 +20,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
|
||||
### Hot topics
|
||||
|
||||
- **BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920**
|
||||
- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
|
||||
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
|
||||
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
|
||||
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
|
||||
@@ -712,6 +713,8 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|
||||
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
|
||||
|
||||
Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
|
||||
|
||||
```bash
|
||||
# obtain the official LLaMA model weights and place them in ./models
|
||||
ls ./models
|
||||
@@ -933,17 +936,25 @@ If your issue is with model generation quality, then please at least scan the fo
|
||||
|
||||
### Android
|
||||
|
||||
#### Build on Android using Termux
|
||||
[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
|
||||
```
|
||||
apt update && apt upgrade -y
|
||||
apt install git make cmake
|
||||
```
|
||||
|
||||
It's recommended to move your model inside the `~/` directory for best performance:
|
||||
```
|
||||
cd storage/downloads
|
||||
mv model.gguf ~/
|
||||
```
|
||||
|
||||
[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
|
||||
|
||||
#### Building the Project using Android NDK
|
||||
You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
|
||||
|
||||
First, install the essential packages for termux:
|
||||
```
|
||||
pkg install clang wget git cmake
|
||||
```
|
||||
Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
|
||||
|
||||
You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux.
|
||||
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
|
||||
|
||||
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
|
||||
```
|
||||
$ mkdir build-android
|
||||
$ cd build-android
|
||||
@@ -951,7 +962,9 @@ $ export NDK=<your_ndk_directory>
|
||||
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
|
||||
$ make
|
||||
```
|
||||
Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
|
||||
|
||||
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
|
||||
|
||||
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
|
||||
|
||||
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
|
||||
@@ -973,25 +986,10 @@ $cd /data/data/com.termux/files/home/bin
|
||||
$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
|
||||
```
|
||||
|
||||
Here is a demo of an interactive session running on Pixel 5 phone:
|
||||
Here's a demo of an interactive session running on Pixel 5 phone:
|
||||
|
||||
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
|
||||
|
||||
#### Build on Android using Termux
|
||||
[Termux](https://github.com/termux/termux-app#installation) is an alternative to execute `llama.cpp` on an Android device (no root required).
|
||||
```
|
||||
apt update && apt upgrade -y
|
||||
apt install git
|
||||
```
|
||||
|
||||
It's recommended to move your model inside the `~/` directory for best performance:
|
||||
```
|
||||
cd storage/downloads
|
||||
mv model.gguf ~/
|
||||
```
|
||||
|
||||
[Follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
|
||||
|
||||
### Docker
|
||||
|
||||
#### Prerequisites
|
||||
|
||||
@@ -160,9 +160,8 @@ function gg_run_test_scripts_debug {
|
||||
|
||||
set -e
|
||||
|
||||
# TODO: too slow, run on dedicated node
|
||||
#(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
#(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
||||
|
||||
set +e
|
||||
}
|
||||
@@ -695,8 +694,10 @@ test $ret -eq 0 && gg_run ctest_release
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
test $ret -eq 0 && gg_run embd_bge_small
|
||||
|
||||
test $ret -eq 0 && gg_run test_scripts_debug
|
||||
test $ret -eq 0 && gg_run test_scripts_release
|
||||
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
|
||||
test $ret -eq 0 && gg_run test_scripts_debug
|
||||
test $ret -eq 0 && gg_run test_scripts_release
|
||||
fi
|
||||
|
||||
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
||||
if [ -z ${GG_BUILD_CUDA} ]; then
|
||||
|
||||
@@ -35,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
|
||||
|
||||
result->prev.resize(params.n_prev);
|
||||
|
||||
result->n_considered = 0;
|
||||
|
||||
llama_sampling_set_rng_seed(result, params.seed);
|
||||
|
||||
return result;
|
||||
@@ -64,6 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
|
||||
|
||||
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
|
||||
ctx->cur.clear();
|
||||
ctx->n_considered = 0;
|
||||
}
|
||||
|
||||
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
|
||||
@@ -253,6 +256,8 @@ static llama_token llama_sampling_sample_impl(
|
||||
}
|
||||
}
|
||||
|
||||
ctx_sampling->n_considered = cur_p.size;
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
|
||||
@@ -81,6 +81,7 @@ struct llama_sampling_context {
|
||||
// TODO: replace with ring-buffer
|
||||
std::vector<llama_token> prev;
|
||||
std::vector<llama_token_data> cur;
|
||||
size_t n_considered;
|
||||
|
||||
std::mt19937 rng;
|
||||
};
|
||||
|
||||
Regular → Executable
+15
@@ -1,3 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# This script downloads the tokenizer models of the specified models from Huggingface and
|
||||
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
|
||||
#
|
||||
@@ -64,6 +66,9 @@ models = [
|
||||
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
||||
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
||||
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
|
||||
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
|
||||
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
|
||||
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
|
||||
]
|
||||
|
||||
# make directory "models/tokenizers" if it doesn't exist
|
||||
@@ -104,6 +109,14 @@ for model in models:
|
||||
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
|
||||
# if downloaded file is less than 1KB, we likely need to download an LFS instead
|
||||
if os.path.getsize(save_path) < 1024:
|
||||
# remove the file
|
||||
os.remove(save_path)
|
||||
url = f"{repo}/resolve/main/tokenizer.json"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
|
||||
if tokt == TOKENIZER_TYPE.SPM:
|
||||
url = f"{repo}/resolve/main/tokenizer.model"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer.model"
|
||||
@@ -139,6 +152,8 @@ for model in models:
|
||||
# print the "pre_tokenizer" content from the tokenizer.json
|
||||
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
cfg = json.load(f)
|
||||
normalizer = cfg["normalizer"]
|
||||
logger.info("normalizer: " + json.dumps(normalizer, indent=4))
|
||||
pre_tokenizer = cfg["pre_tokenizer"]
|
||||
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
|
||||
|
||||
|
||||
+12
-2
@@ -311,6 +311,15 @@ class Model(ABC):
|
||||
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
|
||||
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
|
||||
res = "refact"
|
||||
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
|
||||
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
|
||||
res = "command-r"
|
||||
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
|
||||
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
|
||||
res = "olmo"
|
||||
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
|
||||
# ref: https://huggingface.co/databricks/dbrx-instruct
|
||||
res = "dbrx"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2828,8 +2837,9 @@ class OlmoModel(Model):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_layer_norm_eps(1e-5)
|
||||
if "clip_qkv" in self.hparams is not None:
|
||||
self.gguf_writer.add_clamp_kqv(self.hparams["clip_qkv"])
|
||||
clip_qkv = self.hparams.get("clip_qkv")
|
||||
if clip_qkv is not None:
|
||||
self.gguf_writer.add_clamp_kqv(clip_qkv)
|
||||
|
||||
# Same as super class, but permuting q_proj, k_proj
|
||||
# Copied from: LlamaModel
|
||||
|
||||
@@ -16,6 +16,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
logger = logging.getLogger("lora-to-gguf")
|
||||
|
||||
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
|
||||
|
||||
+1
-1
@@ -23,7 +23,7 @@ Install BLIS:
|
||||
sudo make install
|
||||
```
|
||||
|
||||
We recommend using openmp since it's easier to modify the cores been used.
|
||||
We recommend using openmp since it's easier to modify the cores being used.
|
||||
|
||||
### llama.cpp compilation
|
||||
|
||||
|
||||
@@ -96,9 +96,9 @@ NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorc
|
||||
|
||||
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
|
||||
|
||||
Have a look to existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
|
||||
Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
|
||||
|
||||
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR.
|
||||
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
|
||||
|
||||
Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
|
||||
|
||||
|
||||
@@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||
GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
|
||||
|
||||
auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
|
||||
if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
|
||||
if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
|
||||
return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
|
||||
} else if (a->type == GGML_TYPE_F32) {
|
||||
return ggml_add(ctx, a, b);
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
|
||||
struct Stats {
|
||||
std::vector<float> values;
|
||||
std::vector<int> counts;
|
||||
int ncall = 0;
|
||||
};
|
||||
|
||||
@@ -121,12 +122,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
auto & e = m_stats[wname];
|
||||
|
||||
++e.ncall;
|
||||
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
|
||||
// using the following line, we can correct for that if needed by replacing the line above with:
|
||||
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
|
||||
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(src1->ne[0]*n_as, 0);
|
||||
e.counts.resize(src1->ne[0]*n_as, 0);
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
|
||||
@@ -153,6 +152,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
|
||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||
e.values[e_start + j] += x[j]*x[j];
|
||||
e.counts[e_start + j]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -170,6 +170,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
auto& e = m_stats[wname];
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(src1->ne[0], 0);
|
||||
e.counts.resize(src1->ne[0], 0);
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
|
||||
@@ -183,6 +184,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
const float * x = data + row * src1->ne[0];
|
||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||
e.values[j] += x[j]*x[j];
|
||||
e.counts[j]++;
|
||||
}
|
||||
}
|
||||
if (e.ncall > m_last_call) {
|
||||
@@ -222,7 +224,13 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
|
||||
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
|
||||
int nval = p.second.values.size();
|
||||
out.write((const char *) &nval, sizeof(nval));
|
||||
if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
|
||||
if (nval > 0) {
|
||||
std::vector<float> tmp(nval);
|
||||
for (int i = 0; i < nval; i++) {
|
||||
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
|
||||
}
|
||||
out.write((const char*)tmp.data(), nval*sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
// Write the number of call the matrix was computed with
|
||||
@@ -270,14 +278,28 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
|
||||
imatrix_data = {};
|
||||
return false;
|
||||
}
|
||||
e.values.resize(nval);
|
||||
in.read((char*)e.values.data(), nval*sizeof(float));
|
||||
|
||||
// When re-called from load_imatrix() with add set, this will already be created.
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(nval, 0);
|
||||
e.counts.resize(nval, 0);
|
||||
}
|
||||
|
||||
std::vector<float> tmp(nval);
|
||||
in.read((char*)tmp.data(), nval*sizeof(float));
|
||||
if (in.fail()) {
|
||||
printf("%s: failed reading data for entry %d\n",__func__,i);
|
||||
imatrix_data = {};
|
||||
return false;
|
||||
}
|
||||
e.ncall = ncall;
|
||||
|
||||
// Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
|
||||
for (int i = 0; i < nval; i++) {
|
||||
e.values[i] += tmp[i];
|
||||
e.counts[i] += ncall;
|
||||
}
|
||||
e.ncall += ncall;
|
||||
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -178,6 +178,7 @@ struct cmd_params {
|
||||
std::vector<std::vector<float>> tensor_split;
|
||||
std::vector<bool> use_mmap;
|
||||
std::vector<bool> embeddings;
|
||||
ggml_numa_strategy numa;
|
||||
int reps;
|
||||
bool verbose;
|
||||
output_formats output_format;
|
||||
@@ -200,6 +201,7 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
||||
/* use_mmap */ {true},
|
||||
/* embeddings */ {false},
|
||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||
/* reps */ 5,
|
||||
/* verbose */ false,
|
||||
/* output_format */ MARKDOWN
|
||||
@@ -224,6 +226,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
||||
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
||||
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
||||
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
@@ -396,6 +399,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
}
|
||||
auto p = split<bool>(argv[i], split_delim);
|
||||
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
|
||||
} else if (arg == "--numa") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
} else {
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
||||
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
||||
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
||||
else { invalid_param = true; break; }
|
||||
}
|
||||
} else if (arg == "-fa" || arg == "--flash-attn") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -1215,6 +1229,7 @@ int main(int argc, char ** argv) {
|
||||
llama_log_set(llama_null_log_callback, NULL);
|
||||
}
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// initialize printer
|
||||
std::unique_ptr<printer> p;
|
||||
|
||||
@@ -56,7 +56,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa
|
||||
python ./convert.py ../llava-v1.5-7b --skip-unknown
|
||||
```
|
||||
|
||||
Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
|
||||
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
|
||||
|
||||
## LLaVA 1.6 gguf conversion
|
||||
1) First clone a LLaVA 1.6 model:
|
||||
|
||||
@@ -143,7 +143,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
|
||||
|
||||
### Extended Context Size
|
||||
|
||||
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
|
||||
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
|
||||
|
||||
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
|
||||
|
||||
@@ -286,7 +286,7 @@ These options help improve the performance and memory usage of the LLaMA models.
|
||||
|
||||
- `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
|
||||
- `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node.
|
||||
- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitraty core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.
|
||||
- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitrary core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.
|
||||
|
||||
These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
|
||||
|
||||
|
||||
@@ -796,7 +796,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// deal with end of generation tokens in interactive mode
|
||||
if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
|
||||
LOG("found EOS token\n");
|
||||
LOG("found an EOG token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
if (!params.antiprompt.empty()) {
|
||||
|
||||
@@ -46,7 +46,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", },
|
||||
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
|
||||
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
||||
|
||||
@@ -62,6 +62,18 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
|
||||
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
||||
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
|
||||
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
|
||||
- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn`
|
||||
- `--rope-freq-base N` : RoPE frequency base (default: loaded from model)
|
||||
- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25)
|
||||
- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation)
|
||||
- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
|
||||
- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0)
|
||||
- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0)
|
||||
- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls`
|
||||
- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled)
|
||||
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
|
||||
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
|
||||
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
|
||||
|
||||
**If compiled with `LLAMA_SERVER_SSL=ON`**
|
||||
- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
|
||||
@@ -260,7 +272,7 @@ node index.js
|
||||
|
||||
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
|
||||
|
||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token. Default: `0`
|
||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
|
||||
|
||||
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
|
||||
|
||||
|
||||
+24
-10
@@ -2266,17 +2266,31 @@ struct server_context {
|
||||
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
|
||||
result.tok = id;
|
||||
|
||||
const int32_t n_probs = slot.sparams.n_probs;
|
||||
if (slot.sparams.temp <= 0 && n_probs > 0) {
|
||||
// for llama_sample_token_greedy we need to sort candidates
|
||||
llama_sample_softmax(ctx, &cur_p);
|
||||
}
|
||||
const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
|
||||
if (n_probs > 0) {
|
||||
const size_t n_considered = slot.ctx_sampling->n_considered;
|
||||
|
||||
for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p.data[i].id,
|
||||
cur_p.data[i].p
|
||||
});
|
||||
// Make sure at least n_probs top tokens are at the front of the vector:
|
||||
if (slot.sparams.temp == 0.0f && n_probs > n_considered) {
|
||||
llama_sample_top_k(ctx, &cur_p, n_probs, 0);
|
||||
}
|
||||
|
||||
if (slot.sparams.temp == 0.0f) {
|
||||
// With greedy sampling the probabilities have possibly not been calculated.
|
||||
for (size_t i = 0; i < n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p.data[i].id,
|
||||
i == 0 ? 1.0f : 0.0f
|
||||
});
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p.data[i].id,
|
||||
i >= n_considered ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!process_token(result, slot)) {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# llama.cpp/example/sycl
|
||||
|
||||
This example program provide the tools for llama.cpp for SYCL on Intel GPU.
|
||||
This example program provides the tools for llama.cpp for SYCL on Intel GPU.
|
||||
|
||||
## Tool
|
||||
|
||||
|
||||
Generated
+12
-18
@@ -5,11 +5,11 @@
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1712014858,
|
||||
"narHash": "sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm+GpZNw=",
|
||||
"lastModified": 1714641030,
|
||||
"narHash": "sha256-yzcRNDoyVP7+SCNX0wmuDju1NUCt8Dz9+lyUXEI0dbI=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "9126214d0a59633752a136528f5f3b9aa8565b7d",
|
||||
"rev": "e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1714076141,
|
||||
"narHash": "sha256-Drmja/f5MRHZCskS6mvzFqxEaZMeciScCTFxWVLqWEY=",
|
||||
"lastModified": 1714635257,
|
||||
"narHash": "sha256-4cPymbty65RvF1DWQfc+Bc8B233A1BWxJnNULJKQ1EY=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "7bb2ccd8cdc44c91edba16c48d2c8f331fb3d856",
|
||||
"rev": "63c3a29ca82437c87573e4c6919b09a24ea61b0f",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -36,20 +36,14 @@
|
||||
},
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"dir": "lib",
|
||||
"lastModified": 1711703276,
|
||||
"narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
|
||||
"type": "github"
|
||||
"lastModified": 1714640452,
|
||||
"narHash": "sha256-QBx10+k6JWz6u7VsohfSw8g8hjdBZEf8CFzXH1/1Z94=",
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz"
|
||||
},
|
||||
"original": {
|
||||
"dir": "lib",
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
|
||||
+3
-3
@@ -113,7 +113,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
int device_vmm = 0;
|
||||
|
||||
#if !defined(GGML_USE_HIPBLAS)
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||
CUdevice device;
|
||||
CU_CHECK(cuDeviceGet(&device, id));
|
||||
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
||||
@@ -259,7 +259,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
};
|
||||
|
||||
// pool with virtual memory
|
||||
#if !defined(GGML_USE_HIPBLAS)
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||
|
||||
@@ -356,7 +356,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
#endif // !defined(GGML_USE_HIPBLAS)
|
||||
|
||||
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
||||
#if !defined(GGML_USE_HIPBLAS)
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||
if (ggml_cuda_info().devices[device].vmm) {
|
||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
||||
}
|
||||
|
||||
+77
@@ -17,6 +17,83 @@
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
/**
|
||||
* Converts brain16 to float32.
|
||||
*
|
||||
* The bfloat16 floating point format has the following structure:
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───┐
|
||||
* 0b0000000000000000 brain16
|
||||
*
|
||||
* Since bf16 has the same number of exponent bits as a 32bit float,
|
||||
* encoding and decoding numbers becomes relatively straightforward.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───────────────────┐
|
||||
* 0b00000000000000000000000000000000 IEEE binary32
|
||||
*
|
||||
* For comparison, the standard fp16 format has fewer exponent bits.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌─┴─┐┌─┴──────┐
|
||||
* 0b0000000000000000 IEEE binary16
|
||||
*
|
||||
* @see IEEE 754-2008
|
||||
*/
|
||||
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.i = (uint32_t)h.bits << 16;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts float32 to brain16.
|
||||
*
|
||||
* This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
|
||||
* Subnormals shall be flushed to zero, and NANs will be quiet.
|
||||
* This code should vectorize nicely if using modern compilers.
|
||||
*/
|
||||
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||
ggml_bf16_t h;
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.f = s;
|
||||
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
||||
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
||||
return h;
|
||||
}
|
||||
if (!(u.i & 0x7f800000)) { /* subnormal */
|
||||
h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
|
||||
return h;
|
||||
}
|
||||
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
||||
return h;
|
||||
}
|
||||
|
||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
+1
-1
@@ -803,7 +803,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
return op->ne[3] == 1;
|
||||
return op->src[0]->type != GGML_TYPE_BF16 && op->ne[3] == 1;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
|
||||
+1
-1
@@ -2175,7 +2175,7 @@ kernel void kernel_flash_attn_ext_f16(
|
||||
|
||||
const short D4 = D/4;
|
||||
const short D8 = D/8;
|
||||
const short Q8 = Q/8;
|
||||
//const short Q8 = Q/8;
|
||||
const short NW = N_SIMDWIDTH;
|
||||
const short SH = (C + Q); // shared memory per simdgroup in (half)
|
||||
|
||||
|
||||
@@ -12450,6 +12450,24 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
|
||||
const size_t nb = nbytes/ggml_type_size(type);
|
||||
|
||||
switch (type) {
|
||||
case GGML_TYPE_BF16:
|
||||
{
|
||||
int nans = 0;
|
||||
int infs = 0;
|
||||
const unsigned short * f = (const unsigned short *) data;
|
||||
for (size_t i = 0; i < nb; ++i) {
|
||||
nans += (f[i] & 0x7fff) > 0x7f80;
|
||||
infs += (f[i] & 0x7fff) == 0x7f80;
|
||||
}
|
||||
if (nans) {
|
||||
fprintf(stderr, "%s: found %d NaNs in row of %zu BF16 values\n", __func__, nans, nb);
|
||||
return false;
|
||||
}
|
||||
if (infs) {
|
||||
fprintf(stderr, "%s: found %d infinities in row of %zu BF16 values\n", __func__, infs, nb);
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
const ggml_fp16_t * f = (const ggml_fp16_t *) data;
|
||||
|
||||
@@ -326,14 +326,20 @@ extern "C" {
|
||||
// get ggml_status name string
|
||||
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
|
||||
|
||||
// ieee 754-2008 half-precision float16
|
||||
// todo: make this not an integral type
|
||||
typedef uint16_t ggml_fp16_t;
|
||||
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t);
|
||||
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
|
||||
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
|
||||
GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
|
||||
|
||||
// convert FP16 <-> FP32
|
||||
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
||||
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
||||
|
||||
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
|
||||
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
|
||||
// google brain half-precision bfloat16
|
||||
typedef struct { uint16_t bits; } ggml_bf16_t;
|
||||
GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
|
||||
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
|
||||
GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
|
||||
GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
|
||||
|
||||
struct ggml_object;
|
||||
struct ggml_context;
|
||||
@@ -370,6 +376,7 @@ extern "C" {
|
||||
GGML_TYPE_I64 = 27,
|
||||
GGML_TYPE_F64 = 28,
|
||||
GGML_TYPE_IQ1_M = 29,
|
||||
GGML_TYPE_BF16 = 30,
|
||||
GGML_TYPE_COUNT,
|
||||
};
|
||||
|
||||
@@ -410,6 +417,7 @@ extern "C" {
|
||||
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
|
||||
@@ -817,6 +817,7 @@ class GGMLQuantizationType(IntEnum):
|
||||
I64 = 27
|
||||
F64 = 28
|
||||
IQ1_M = 29
|
||||
BF16 = 30
|
||||
|
||||
|
||||
class GGUFEndian(IntEnum):
|
||||
@@ -888,6 +889,7 @@ GGML_QUANT_SIZES = {
|
||||
GGMLQuantizationType.I64: (1, 8),
|
||||
GGMLQuantizationType.F64: (1, 8),
|
||||
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
|
||||
GGMLQuantizationType.BF16: (1, 2),
|
||||
}
|
||||
|
||||
|
||||
|
||||
+1
-1
@@ -51,7 +51,7 @@ single-line ::= [^\n]+ "\n"`
|
||||
|
||||
## Sequences and Alternatives
|
||||
|
||||
The order of symbols in a sequence matter. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
|
||||
The order of symbols in a sequence matters. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
|
||||
|
||||
Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.
|
||||
|
||||
|
||||
@@ -3175,6 +3175,7 @@ struct llama_model_loader {
|
||||
switch (type_max) {
|
||||
case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
|
||||
case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
|
||||
case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
|
||||
case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
|
||||
case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
|
||||
case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
|
||||
@@ -3666,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||
switch (ftype) {
|
||||
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
||||
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
||||
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||
@@ -4386,6 +4388,15 @@ static void llm_load_vocab(
|
||||
} else if (
|
||||
tokenizer_pre == "refact") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
|
||||
} else if (
|
||||
tokenizer_pre == "command-r") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
|
||||
} else if (
|
||||
tokenizer_pre == "olmo") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
|
||||
} else if (
|
||||
tokenizer_pre == "dbrx") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
}
|
||||
@@ -6123,6 +6134,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
|| !(
|
||||
model.ftype == LLAMA_FTYPE_ALL_F32 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
|
||||
)
|
||||
@@ -12191,6 +12203,7 @@ struct llm_tokenizer_bpe {
|
||||
case LLAMA_VOCAB_TYPE_BPE:
|
||||
switch (vocab.type_pre) {
|
||||
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
||||
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
// original regex from tokenizer.json
|
||||
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
@@ -12238,12 +12251,14 @@ struct llm_tokenizer_bpe {
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
|
||||
case LLAMA_VOCAB_PRE_TYPE_REFACT:
|
||||
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
"\\p{N}",
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_GPT2:
|
||||
case LLAMA_VOCAB_PRE_TYPE_OLMO:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
});
|
||||
@@ -14150,13 +14165,16 @@ static void llama_tensor_dequantize_internal(
|
||||
if (qtype.to_float == NULL) {
|
||||
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
|
||||
}
|
||||
} else if (tensor->type != GGML_TYPE_F16) {
|
||||
} else if (tensor->type != GGML_TYPE_F16 &&
|
||||
tensor->type != GGML_TYPE_BF16) {
|
||||
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
|
||||
}
|
||||
|
||||
if (nthread < 2) {
|
||||
if (tensor->type == GGML_TYPE_F16) {
|
||||
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
|
||||
} else if (tensor->type == GGML_TYPE_BF16) {
|
||||
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
|
||||
} else if (ggml_is_quantized(tensor->type)) {
|
||||
qtype.to_float(tensor->data, f32_output, nelements);
|
||||
} else {
|
||||
@@ -14165,7 +14183,14 @@ static void llama_tensor_dequantize_internal(
|
||||
return;
|
||||
}
|
||||
|
||||
size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
|
||||
size_t block_size;
|
||||
if (tensor->type == GGML_TYPE_F16 ||
|
||||
tensor->type == GGML_TYPE_BF16) {
|
||||
block_size = 1;
|
||||
} else {
|
||||
block_size = (size_t)ggml_blck_size(tensor->type);
|
||||
}
|
||||
|
||||
size_t block_size_bytes = ggml_type_size(tensor->type);
|
||||
|
||||
GGML_ASSERT(nelements % block_size == 0);
|
||||
@@ -14184,6 +14209,8 @@ static void llama_tensor_dequantize_internal(
|
||||
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
|
||||
if (typ == GGML_TYPE_F16) {
|
||||
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
||||
} else if (typ == GGML_TYPE_BF16) {
|
||||
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
|
||||
} else {
|
||||
qtype.to_float(inbuf, outbuf, nels);
|
||||
}
|
||||
@@ -14544,6 +14571,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
|
||||
case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
|
||||
case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
|
||||
case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
|
||||
|
||||
// K-quants
|
||||
|
||||
@@ -80,6 +80,9 @@ extern "C" {
|
||||
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
||||
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
||||
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 10,
|
||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 11,
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
@@ -135,6 +138,7 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
||||
Binary file not shown.
@@ -0,0 +1,106 @@
|
||||
ied 4 ½ months
|
||||
__ggml_vocab_test__
|
||||
Führer
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
__ggml_vocab_test__
|
||||
Hello world
|
||||
__ggml_vocab_test__
|
||||
Hello world
|
||||
__ggml_vocab_test__
|
||||
Hello World
|
||||
__ggml_vocab_test__
|
||||
Hello World
|
||||
__ggml_vocab_test__
|
||||
Hello World!
|
||||
__ggml_vocab_test__
|
||||
Hello, world!
|
||||
__ggml_vocab_test__
|
||||
Hello, world!
|
||||
__ggml_vocab_test__
|
||||
this is 🦙.cpp
|
||||
__ggml_vocab_test__
|
||||
w048 7tuijk dsdfhu
|
||||
__ggml_vocab_test__
|
||||
нещо на Български
|
||||
__ggml_vocab_test__
|
||||
កាន់តែពិសេសអាចខលចេញ
|
||||
__ggml_vocab_test__
|
||||
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
(
|
||||
__ggml_vocab_test__
|
||||
|
||||
=
|
||||
__ggml_vocab_test__
|
||||
' era
|
||||
__ggml_vocab_test__
|
||||
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
|
||||
__ggml_vocab_test__
|
||||
3
|
||||
__ggml_vocab_test__
|
||||
33
|
||||
__ggml_vocab_test__
|
||||
333
|
||||
__ggml_vocab_test__
|
||||
3333
|
||||
__ggml_vocab_test__
|
||||
33333
|
||||
__ggml_vocab_test__
|
||||
333333
|
||||
__ggml_vocab_test__
|
||||
3333333
|
||||
__ggml_vocab_test__
|
||||
33333333
|
||||
__ggml_vocab_test__
|
||||
333333333
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
|
||||
__ggml_vocab_test__
|
||||
@@ -0,0 +1,43 @@
|
||||
2536 228 27 228 22957 6983
|
||||
45 193433
|
||||
|
||||
228
|
||||
1667
|
||||
1742
|
||||
205
|
||||
206
|
||||
2126
|
||||
11516
|
||||
34777
|
||||
28339 3845
|
||||
46609 3845
|
||||
28339 3930
|
||||
46609 3930
|
||||
46609 3930 8
|
||||
28339 19 3845 8
|
||||
46609 19 3845 8
|
||||
2075 1801 11254 107 255 21 19317
|
||||
94 23 27 31 228 30 21213 20752 39267 6405 9980
|
||||
4929 40071 2196 3236 8750 1764 37097 41168
|
||||
38111 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 38111 231 38111 257 38111 235 165 24629 38111 239
|
||||
2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 1737 10203 109160 1875 2222 2517 3342 12523 16
|
||||
28339
|
||||
46609
|
||||
228 46609
|
||||
1667 46609
|
||||
1742 46609
|
||||
1742 46609 1856 46609
|
||||
1737
|
||||
206 1857
|
||||
14 4515
|
||||
28339 19 1770 14 1954 8 4070 1955 1933 80503 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372
|
||||
26
|
||||
26 26
|
||||
26 26 26
|
||||
26 26 26 26
|
||||
26 26 26 26 26
|
||||
26 26 26 26 26 26
|
||||
26 26 26 26 26 26 26
|
||||
26 26 26 26 26 26 26 26
|
||||
26 26 26 26 26 26 26 26 26
|
||||
127731 51628 205 57788 18494 97469 126134 206 2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 11254 107 255 2226 107 255 228 26 228 26 26 228 26 26 26 228 26 26 26 26 228 26 26 26 26 26 228 26 26 26 26 26 26 228 26 26 26 26 26 26 26 228 26 26 26 26 26 26 26 26 228 26 21 26 228 26 2271 26 228 26 3834 26 182018 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 188568 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372 8391 158343 3512 40071 2196 3236 8750 1764 37097 41168 29721 32797 25646 3802 4975 4975 116167 57178 10251 154048 27292 1767 5125 2632 2155 91 2378 1919 1914 2782 19 2155 3354 1933 5470 38 2155 52 2068 5470 1767 4961 3059 1894 19 2155 43 1933 3026 2725 23186 38 2930 14 20676 1671 14 83 51
|
||||
@@ -1,5 +1,5 @@
|
||||
numpy~=1.24.4
|
||||
sentencepiece~=0.1.98
|
||||
transformers>=4.35.2,<5.0.0
|
||||
transformers>=4.40.1,<5.0.0
|
||||
gguf>=0.1.0
|
||||
protobuf>=4.21.0,<5.0.0
|
||||
|
||||
@@ -93,11 +93,14 @@ help_s = (
|
||||
"specified values are averaged WITHOUT weighing by the --repetitions parameter of llama-bench."
|
||||
)
|
||||
parser.add_argument("-s", "--show", help=help_s)
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
|
||||
known_args, unknown_args = parser.parse_known_args()
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
|
||||
|
||||
if unknown_args:
|
||||
logger.error(f"Received unknown args: {unknown_args}.")
|
||||
logger.error(f"Received unknown args: {unknown_args}.\n")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
@@ -110,7 +113,7 @@ if input_file is None:
|
||||
input_file = sqlite_files[0]
|
||||
|
||||
if input_file is None:
|
||||
logger.error("Cannot find a suitable input file, please provide one.")
|
||||
logger.error("Cannot find a suitable input file, please provide one.\n")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
@@ -202,12 +205,12 @@ elif repo is not None:
|
||||
hexsha8_baseline = find_parent_in_data(repo.heads.master.commit)
|
||||
|
||||
if hexsha8_baseline is None:
|
||||
logger.error("No baseline was provided and did not find data for any master branch commits.")
|
||||
logger.error("No baseline was provided and did not find data for any master branch commits.\n")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
else:
|
||||
logger.error("No baseline was provided and the current working directory "
|
||||
"is not part of a git repository from which a baseline could be inferred.")
|
||||
"is not part of a git repository from which a baseline could be inferred.\n")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
@@ -238,7 +241,7 @@ elif repo is not None:
|
||||
break
|
||||
|
||||
if hexsha8_compare is None:
|
||||
logger.error("No compare target was provided and did not find data for any non-master commits.")
|
||||
logger.error("No compare target was provided and did not find data for any non-master commits.\n")
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
else:
|
||||
@@ -361,7 +364,7 @@ if "gpu_info" in show:
|
||||
headers = [PRETTY_NAMES[p] for p in show]
|
||||
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
|
||||
|
||||
logger.info(tabulate(
|
||||
print(tabulate( # noqa: NP100
|
||||
table,
|
||||
headers=headers,
|
||||
floatfmt=".2f",
|
||||
|
||||
@@ -41,20 +41,20 @@ def get_matches(regex_expr):
|
||||
|
||||
|
||||
def print_cat(cat, ranges):
|
||||
print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))
|
||||
print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
|
||||
cnt = 0
|
||||
for start, end in ranges:
|
||||
if cnt % 4 != 0:
|
||||
print(" ", end="")
|
||||
print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="")
|
||||
print(" ", end="") # noqa: NP100
|
||||
print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="") # noqa: NP100
|
||||
if cnt % 4 == 3:
|
||||
print("")
|
||||
print("") # noqa: NP100
|
||||
cnt += 1
|
||||
|
||||
if cnt % 4 != 0:
|
||||
print("")
|
||||
print("};")
|
||||
print("")
|
||||
print("") # noqa: NP100
|
||||
print("};") # noqa: NP100
|
||||
print("") # noqa: NP100
|
||||
|
||||
|
||||
print_cat("number", get_matches(r'\p{N}'))
|
||||
|
||||
@@ -83,6 +83,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
|
||||
|
||||
# build test-tokenizer-1-bpe target once and add many tests
|
||||
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
|
||||
|
||||
@@ -50,7 +50,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
||||
|
||||
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
|
||||
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
|
||||
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
|
||||
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
|
||||
GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
|
||||
std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
|
||||
std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
|
||||
@@ -92,6 +92,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
||||
size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
|
||||
if (t->type == GGML_TYPE_F16) {
|
||||
tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
|
||||
} else if (t->type == GGML_TYPE_BF16) {
|
||||
tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
|
||||
} else if (t->type == GGML_TYPE_F32) {
|
||||
tv.push_back(*(float *) &buf[i]);
|
||||
} else if (t->type == GGML_TYPE_I32) {
|
||||
@@ -1898,7 +1900,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
std::default_random_engine rng(0);
|
||||
|
||||
const ggml_type all_types[] = {
|
||||
GGML_TYPE_F32, GGML_TYPE_F16,
|
||||
GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
|
||||
GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
|
||||
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
|
||||
GGML_TYPE_Q8_0,
|
||||
|
||||
@@ -13,7 +13,7 @@ fname_tok = args.fname_tok
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
|
||||
|
||||
print('tokenizing file: ', fname_tok)
|
||||
print('tokenizing file: ', fname_tok) # noqa: NP100
|
||||
fname_out = fname_tok + '.tok'
|
||||
with open(fname_tok, 'r', encoding='utf-8') as f:
|
||||
lines = f.readlines()
|
||||
@@ -21,7 +21,7 @@ with open(fname_tok, 'r', encoding='utf-8') as f:
|
||||
t_start = time.time()
|
||||
res = tokenizer.encode(s, add_special_tokens=False)
|
||||
t_end = time.time()
|
||||
print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)')
|
||||
print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') # noqa: NP100
|
||||
with open(fname_out, 'w', encoding='utf-8') as f:
|
||||
for x in res:
|
||||
# LLaMA v3 for some reason strips the space for these tokens (and others)
|
||||
@@ -41,6 +41,6 @@ with open(fname_tok, 'r', encoding='utf-8') as f:
|
||||
# f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
|
||||
# f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
|
||||
f.write(str(x) + '\n')
|
||||
print('len(res): ', len(res))
|
||||
print('len(lines): ', len(lines))
|
||||
print('results written to: ', fname_out)
|
||||
print('len(res): ', len(res)) # noqa: NP100
|
||||
print('len(lines): ', len(lines)) # noqa: NP100
|
||||
print('results written to: ', fname_out) # noqa: NP100
|
||||
|
||||
Reference in New Issue
Block a user