Compare commits

...

9 Commits

Author SHA1 Message Date
Johannes Gäßler ad014bba97 make: add error message for bad CUDA version (#5444)
* make: add error message for bad CUDA version

* Update Makefile

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

---------

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
2024-02-13 12:38:37 +01:00
Georgi Gerganov 49cc1f7d67 bert : add tests + fix quantization (#5475)
* llama : do not quantize pos embd and token type tensors

* ci : add BERT tests

ggml-ci

* ci : do not do BERT tests on low-perf nodes

ggml-ci
2024-02-13 13:01:29 +02:00
Georgi Gerganov 99b8b43d7b tests : disable moe test (#5473) 2024-02-13 11:20:24 +02:00
Kawrakow 895407f31b ggml-quants : fix compiler warnings (shadow variable) (#5472)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2024-02-13 09:07:57 +02:00
Georgi Gerganov 099afc6274 llama : fix quantization when tensors are missing (#5423) 2024-02-12 20:14:39 +02:00
Georgi Gerganov df334a1125 swift : package no longer use ggml dependency (#5465)
* Revert "swift : update Package.swift to use ggml as dependency (#4691)"

This reverts commit ece9a45e8f.

* spm : add ggml headers
2024-02-12 19:54:29 +02:00
Lee dbd8828eb0 py : fix persimmon n_rot conversion (#5460)
* convert : fix persimmon offical weight conversion to write correct n_rot.

* Update convert-persimmon-to-gguf.py

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-12 19:29:57 +02:00
Abhilash Majumder 43fe07c1a4 ggml-sycl: Replace 3d ops with macro (#5458)
* use macro

* use macro

* fix format
2024-02-12 20:22:05 +05:30
Daniel Bevenius 4a46d2b792 llava : remove prog parameter from ArgumentParser (#5457)
* llava: remove prog parameter from ArgumentParser

This commit removes the `prog` parameter from `ArgumentParser`
so that it uses the default value which is the name of the script.

The motivation for this change is that currently the usage output looks
like this:
```console
$ python examples/llava/convert-image-encoder-to-gguf.py --help
usage: convert_hf_to_gguf.py [-h] ...
```
And with this change it will look like this:
```console
$ python examples/llava/convert-image-encoder-to-gguf.py --help
usage: convert-image-encoder-to-gguf.py [-h] ...
```

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* ci: add W503 to flake8 ignore list

This commit adds W503 to the ignore list for flake8. This is done to
avoid the following error:
W503 line break before binary operator

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

---------

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
2024-02-12 10:38:44 +02:00
13 changed files with 143 additions and 99 deletions
+1 -1
View File
@@ -16,5 +16,5 @@ jobs:
- name: flake8 Lint
uses: py-actions/flake8@v2
with:
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
exclude: "examples/*,examples/*/**,*/**/__init__.py"
+8
View File
@@ -569,6 +569,14 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
$(info I CXX: $(shell $(CXX) --version | head -n 1))
ifdef LLAMA_CUBLAS
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
ifndef CUDA_DOCKER_ARCH
ifndef CUDA_POWER_ARCH
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
endif # LLAMA_CUBLAS
$(info )
+19 -5
View File
@@ -13,17 +13,31 @@ let package = Package(
products: [
.library(name: "llama", targets: ["llama"]),
],
dependencies: [
.package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
],
targets: [
.target(
name: "llama",
dependencies: ["ggml"],
path: ".",
exclude: ["ggml-metal.metal"],
exclude: [
"cmake",
"examples",
"scripts",
"models",
"tests",
"CMakeLists.txt",
"ggml-cuda.cu",
"ggml-cuda.h",
"Makefile"
],
sources: [
"ggml.c",
"llama.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"ggml-metal.m",
],
resources: [
.process("ggml-metal.metal")
],
publicHeadersPath: "spm-headers",
cSettings: [
+46
View File
@@ -568,6 +568,50 @@ function gg_sum_open_llama_7b_v2 {
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
}
# bge-small
function gg_run_embd_bge_small {
cd ${SRC}
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
path_models="../models-mnt/bge-small"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
set +e
}
function gg_sum_embd_bge_small {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'BGE Small (BERT):\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
}
## main
if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -591,6 +635,8 @@ test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2
+2 -1
View File
@@ -88,7 +88,8 @@ def main():
gguf_writer.add_embedding_length(hidden_size)
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
gguf_writer.add_rope_dimension_count(hidden_size // head_count)
# ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
@@ -71,7 +71,7 @@ def bytes_to_unicode():
return dict(zip(bs, cs))
ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py")
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--text-only", action="store_true", required=False,
+18 -18
View File
@@ -3819,15 +3819,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
/* Compute combined scale for the block */
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
__m256i bx = bytes_from_nibbles_32(x[i].qs);
__m256i qx = bytes_from_nibbles_32(x[i].qs);
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
const __m256i off = _mm256_set1_epi8( 8 );
bx = _mm256_sub_epi8( bx, off );
qx = _mm256_sub_epi8( qx, off );
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_i8_pairs_float(bx, by);
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
/* Multiply q with scale and accumulate */
acc = _mm256_fmadd_ps( d, q, acc );
@@ -4196,10 +4196,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
const __m256i bx = bytes_from_nibbles_32(x[i].qs);
const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
const __m256i qx = bytes_from_nibbles_32(x[i].qs);
const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[i].qs );
const __m256 xy = mul_sum_us8_pairs_float(bx, by);
const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
// Accumulate d0*d1*x*y
#if defined(__AVX2__)
@@ -4418,14 +4418,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
/* Compute combined scale for the block */
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
__m256i bx = bytes_from_nibbles_32(x[i].qs);
__m256i qx = bytes_from_nibbles_32(x[i].qs);
__m256i bxhi = bytes_from_bits_32(x[i].qh);
bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
bx = _mm256_or_si256(bx, bxhi);
qx = _mm256_or_si256(qx, bxhi);
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_i8_pairs_float(bx, by);
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
/* Multiply q with scale and accumulate */
acc = _mm256_fmadd_ps(d, q, acc);
@@ -4722,15 +4722,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
__m256i bx = bytes_from_nibbles_32(x[i].qs);
__m256i qx = bytes_from_nibbles_32(x[i].qs);
__m256i bxhi = bytes_from_bits_32(x[i].qh);
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
bx = _mm256_or_si256(bx, bxhi);
qx = _mm256_or_si256(qx, bxhi);
const __m256 dy = _mm256_set1_ps(y[i].d);
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_us8_pairs_float(bx, by);
const __m256 q = mul_sum_us8_pairs_float(qx, qy);
acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
}
@@ -4973,10 +4973,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
for (int i = 0; i < nb; ++i) {
// Compute combined scale for the block
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
__m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
__m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs);
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_i8_pairs_float(bx, by);
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
// Multiply q with scale and accumulate
#if defined(__AVX2__)
+14 -61
View File
@@ -11578,11 +11578,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
}
char * dst_ptr = (char *) dst;
const int64_t ne0 = src->ne[0];
const int64_t nb0 = src->nb[0];
const int64_t nb1 = src->nb[1];
const int64_t nb2 = src->nb[2];
const int64_t nb3 = src->nb[3];
GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
const enum ggml_type type = src->type;
const int64_t ts = ggml_type_size(type);
const int64_t bs = ggml_blck_size(type);
@@ -12426,9 +12423,7 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
const int64_t nrows = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0];
@@ -12758,15 +12753,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
ggml_sycl_op_mul_mat_t op,
const bool convert_src1_to_q8_1) try {
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
const int64_t nrows1 = ggml_nrows(src1);
GGML_ASSERT(ne03 == ne13);
@@ -13337,23 +13326,13 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
const int64_t nb01 = src0->nb[1];
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
const int64_t ne1 = ggml_nelements(src1);
const int64_t ne = ggml_nelements(dst);
@@ -13655,23 +13634,15 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
const int64_t ne01 = src00->ne[1];
const int64_t ne02 = src00->ne[2];
const int64_t ne03 = src00->ne[3];
GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
//const int64_t nb01 = src00->nb[1];
const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb);
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
//const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
const int64_t ne1 = ggml_nelements(src1);
const int64_t ne = ggml_nelements(dst);
@@ -13940,25 +13911,7 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t nb00 = src0->nb[0];
const int64_t nb01 = src0->nb[1];
const int64_t nb02 = src0->nb[2];
const int64_t nb03 = src0->nb[3];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t nb10 = src1->nb[0];
const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2];
const int64_t nb13 = src1->nb[3];
GGML_TENSOR_BINARY_OP_LOCALS;
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
+29 -9
View File
@@ -772,22 +772,37 @@ struct LLM_TN {
llm_arch arch;
std::string operator()(llm_tensor tensor) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
return "__missing__";
}
return LLM_TENSOR_NAMES[arch].at(tensor);
}
std::string operator()(llm_tensor tensor, const std::string & suffix) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
return "__missing__";
}
return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
}
std::string operator()(llm_tensor tensor, int bid) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
return "__missing__";
}
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
}
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
return "__missing__";
}
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
}
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
return "__missing__";
}
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
}
};
@@ -10227,6 +10242,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
}
++qs.i_ffn_up;
}
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
//}
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
@@ -10286,19 +10302,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
@@ -10428,7 +10444,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
quantize &= !params->only_copy;
// do not quantize expert gating tensors
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
// do not quantize positional embeddings and token types (BERT)
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
enum ggml_type new_type;
void * new_data;
+1
View File
@@ -0,0 +1 @@
../ggml-alloc.h
+1
View File
@@ -0,0 +1 @@
../ggml-backend.h
+1
View File
@@ -0,0 +1 @@
../ggml.h
+2 -3
View File
@@ -2129,14 +2129,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_pad());
test_cases.emplace_back(new test_leaky_relu());
// these tests are disabled to save execution time, but they can be handy for debugging
#if 0
#if !defined(__SANITIZE_THREAD__)
// FIXME: these tests use too much memory with thread sanitizer
test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
//test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
#endif
// these tests are disabled to save execution time, but they can be handy for debugging
#if 0
test_cases.emplace_back(new test_llama(1));
test_cases.emplace_back(new test_llama(2));
test_cases.emplace_back(new test_falcon(1));