Compare commits

...

6 Commits

Author SHA1 Message Date
olexiyb f8ec8877b7 ci : fix macos x86 build (#7940)
In order to use old `macos-latest` we should use `macos-12`

Potentially will fix: https://github.com/ggerganov/llama.cpp/issues/6975
2024-06-14 20:28:34 +03:00
Johannes Gäßler 76d66ee0be CUDA: faster q2_K, q3_K MMQ + int8 tensor cores (#7921)
* CUDA: faster q2_K, q3_K MMQ + int8 tensor cores

* try CI fix

* try CI fix

* try CI fix

* fix data race

* rever q2_K precision related changes
2024-06-14 18:41:49 +02:00
Georgi Gerganov 66ef1ceedf metal : utilize max shared memory for mul_mat_id (#7935) 2024-06-14 17:14:09 +03:00
Radoslav Gerganov e65bbf606c llama-bench : fix RPC indication (#7936)
Show "<backend_name>+RPC" when RPC offloading is used
2024-06-14 16:47:41 +03:00
Sigbjørn Skjæret 6fcd1331ef llama : more checks before assuming FIM tokens (#7644)
* More checks before assuming FIM tokens for Llama arch

* extensive token check
2024-06-14 13:20:04 +03:00
Elaine 41b9260f18 convert : add Poro-34B-chat tokenizer support (#7713)
* support for Poro chat pre-tokenizer

* add support for Poro pre-tokenizer

* Update convert-hf-to-gguf-update.py

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Change Poro-34B-chat to poro-chat

* Change Poro-34B-chat to poro-chat

* Update convert-hf-to-gguf-update.py

* Update llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-06-14 13:16:49 +03:00
13 changed files with 529 additions and 367 deletions
+1 -1
View File
@@ -84,7 +84,7 @@ jobs:
name: llama-bin-macos-arm64.zip
macOS-latest-cmake-x64:
runs-on: macos-latest
runs-on: macos-12
steps:
- name: Clone
+1
View File
@@ -83,6 +83,7 @@ models = [
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
]
+3
View File
@@ -477,6 +477,9 @@ class Model:
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
res = "smaug-bpe"
if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
# ref: https://huggingface.co/LumiOpen/Poro-34B-chat
res = "poro-chat"
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
res = "jina-v2-code"
+6 -6
View File
@@ -714,7 +714,6 @@ struct test {
static const bool kompute;
static const bool metal;
static const bool sycl;
static const bool rpc;
static const bool gpu_blas;
static const bool blas;
static const std::string cpu_info;
@@ -726,6 +725,7 @@ struct test {
int n_batch;
int n_ubatch;
int n_threads;
bool has_rpc;
ggml_type type_k;
ggml_type type_v;
int n_gpu_layers;
@@ -751,6 +751,7 @@ struct test {
n_batch = inst.n_batch;
n_ubatch = inst.n_ubatch;
n_threads = inst.n_threads;
has_rpc = !inst.rpc_servers.empty();
type_k = inst.type_k;
type_v = inst.type_v;
n_gpu_layers = inst.n_gpu_layers;
@@ -810,9 +811,6 @@ struct test {
if (sycl) {
return GGML_SYCL_NAME;
}
if (rpc) {
return "RPC";
}
if (gpu_blas) {
return "GPU BLAS";
}
@@ -882,7 +880,7 @@ struct test {
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch),
@@ -916,7 +914,6 @@ const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();
const bool test::sycl = !!ggml_cpu_has_sycl();
const bool test::rpc = !!ggml_cpu_has_rpc();
const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info();
@@ -1182,6 +1179,9 @@ struct markdown_printer : public printer {
value = buf;
} else if (field == "backend") {
value = test::get_backend();
if (t.has_rpc) {
value += "+RPC";
}
} else if (field == "test") {
if (t.n_prompt > 0 && t.n_gen == 0) {
snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
+4 -2
View File
@@ -188,13 +188,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.default_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
info.devices[id].nsm = prop.multiProcessorCount;
info.devices[id].smpb = prop.sharedMemPerBlock;
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
info.devices[id].smpbo = prop.sharedMemPerBlock;
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
info.devices[id].smpb = prop.sharedMemPerBlock;
info.devices[id].nsm = prop.multiProcessorCount;
}
for (int id = 0; id < info.device_count; ++id) {
+1
View File
@@ -73,6 +73,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
const dim3 block_nums(1, nrows, 1);
const size_t shared_mem = ncols_pad * sizeof(int);
// FIXME: this limit could be raised by ~2-4x on Ampere or newer
GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
if (order == GGML_SORT_ORDER_ASC) {
+5
View File
@@ -331,6 +331,10 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
#define FP16_AVAILABLE
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#define FP16_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
@@ -661,6 +665,7 @@ struct ggml_cuda_device_info {
int cc; // compute capability
int nsm; // number of streaming multiprocessors
size_t smpb; // max. shared memory per block
size_t smpbo; // max. shared memory per block (with opt-in)
bool vmm; // virtual memory support
size_t vmm_granularity; // granularity of virtual memory
size_t total_vram;
+441 -309
View File
File diff suppressed because it is too large Load Diff
+1
View File
@@ -130,6 +130,7 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
// FIXME: this limit could be raised by ~2-4x on Ampere or newer
if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
switch (ncols_x) {
case 32:
+16 -19
View File
@@ -265,36 +265,31 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
const half2 & dm2, const float & d8) {
const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
int sumi_d = 0;
int sumi_m = 0;
float sumf_d = 0.0f;
float sumf_m = 0.0f;
#pragma unroll
for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
int sumi_d_sc = 0;
const int sc = scales[i0 / (QI8_1/2)];
// fill int with 4x m
int m = sc >> 4;
m |= m << 8;
m |= m << 16;
const float2 dm2f = __half22float2(dm2[i0/(QI8_1/2)]);
int sumi_d = 0;
int sumi_m = 0;
const int vi0 = v[i0/(QI8_1/2)];
#pragma unroll
for (int i = i0; i < i0 + QI8_1/2; ++i) {
sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
const int vi = (vi0 >> (2*(i % (QI8_1/2)))) & 0x03030303;
sumi_d = __dp4a(vi, u[i], sumi_d); // SIMD dot product
sumi_m = __dp4a(0x01010101, u[i], sumi_m);
}
sumi_d += sumi_d_sc * (sc & 0xF);
sumf_d += dm2f.x * sumi_d;
sumf_m += dm2f.y * sumi_m;
}
const float2 dm2f = __half22float2(dm2);
return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
return d8*(sumf_d - sumf_m);
#else
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -352,8 +347,10 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
int sumi_sc = 0;
#pragma unroll
for (int i = i0; i < i0 + QI8_1/2; ++i) {
sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
const int vi = __vsubss4((v[i/2] >> (4*(i%2))) & 0x0F0F0F0F, 0x04040404);
sumi_sc = __dp4a(vi, u[i], sumi_sc); // SIMD dot product
}
sumi += sumi_sc * scales[i0 / (QI8_1/2)];
+2 -1
View File
@@ -1862,9 +1862,10 @@ static enum ggml_status ggml_metal_graph_compute(
// ne21 = n_rows
const int dst_rows = ne20*ne21;
const int dst_rows_min = n_as;
const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4;
// max size of the rowids array in the kernel shared buffer
GGML_ASSERT(dst_rows <= 2048);
GGML_ASSERT(dst_rows <= dst_rows_max);
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
+47 -29
View File
@@ -4561,35 +4561,6 @@ static void llm_load_vocab(
vocab.special_cls_id = -1;
vocab.special_mask_id = -1;
// For Fill-In-the-Middle (FIM)/infill models which where converted
// prior to support of FIM special tokens in GGUF, the following
// will allow those models to continue to work. The general names
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
// new versions of these models have been published.
std::string gen_name;
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
[](unsigned char c){ return std::tolower(c); });
if (gen_name.find("code") != std::string::npos) {
if (model.arch == LLM_ARCH_LLAMA) {
vocab.special_prefix_id = 32007;
vocab.special_suffix_id = 32008;
vocab.special_middle_id = 32009;
vocab.special_eot_id = 32010;
} else if (model.arch == LLM_ARCH_GEMMA) {
vocab.special_prefix_id = 67;
vocab.special_suffix_id = 69;
vocab.special_middle_id = 68;
// TODO: this is not EOT, it is "file separator" token, needs fix
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
//vocab.special_eot_id = 70;
vocab.special_eot_id = 107;
}
}
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
if (add_space_prefix_keyidx != -1) {
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4713,6 +4684,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "smaug-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
} else if (
tokenizer_pre == "poro-chat") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
@@ -4770,6 +4744,45 @@ static void llm_load_vocab(
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
// For Fill-In-the-Middle (FIM)/infill models which where converted
// prior to support of FIM special tokens in GGUF, the following
// will allow those models to continue to work. The general names
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
// new versions of these models have been published.
std::string gen_name;
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
[](unsigned char c){ return std::tolower(c); });
if (gen_name.find("code") != std::string::npos) {
if (model.arch == LLM_ARCH_LLAMA
&& 32010 < vocab.id_to_token.size()
&& vocab.id_to_token[32007].text == "<PRE>"
&& vocab.id_to_token[32008].text == "<SUF>"
&& vocab.id_to_token[32009].text == "<MID>"
&& vocab.id_to_token[32010].text == "<EOT>") {
vocab.special_prefix_id = 32007;
vocab.special_suffix_id = 32008;
vocab.special_middle_id = 32009;
vocab.special_eot_id = 32010;
} else if (model.arch == LLM_ARCH_GEMMA
&& 107 < vocab.id_to_token.size()
&& vocab.id_to_token[67].text == "<|fim_prefix|>"
&& vocab.id_to_token[69].text == "<|fim_suffix|>"
&& vocab.id_to_token[68].text == "<|fim_middle|>"
&& vocab.id_to_token[107].text == "<end_of_turn>") {
vocab.special_prefix_id = 67;
vocab.special_suffix_id = 69;
vocab.special_middle_id = 68;
// TODO: this is not EOT, it is "file separator" token, needs fix
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
//vocab.special_eot_id = 70;
vocab.special_eot_id = 107;
}
}
try {
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
} catch (const std::exception & e) {
@@ -13028,6 +13041,11 @@ struct llm_tokenizer_bpe {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
});
break;
case LLAMA_VOCAB_PRE_TYPE_PORO:
word_collection = unicode_regex_split(text, {
" ?[^(\\s|.,!?…。,、।۔،)]+",
});
break;
default:
// default regex for BPE tokenization pre-processing
word_collection = unicode_regex_split(text, {
+1
View File
@@ -86,6 +86,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
};
// note: these values should be synchronized with ggml_rope