Compare commits

...

24 Commits

Author SHA1 Message Date
Georgi Gerganov 68a6b98b3c make : fix CUDA build (#5580) 2024-02-19 13:41:51 +02:00
valiray 70d45af0ef readme : fix typo in README-sycl.md (#5353) 2024-02-19 12:37:10 +02:00
Abhilash Majumder 13e2c771aa cmake : remove obsolete sycl compile flags (#5581)
* rm unwanted sycl compile options

* fix bug

* fix bug

* format fix
2024-02-19 11:15:18 +02:00
Georgi Gerganov f53119cec4 minor : fix trailing whitespace (#5538) 2024-02-19 10:34:10 +02:00
Daniel Bevenius 7084755396 llava : avoid changing the original BakLLaVA model (#5577)
This is a follup of Commit fc0c8d286a
("llava : update surgery script to not remove tensors") but this time
the change is to the BakLLaVA specific part of the surgery script.

I've been able to test this using SkunkworksAI/BakLLaVA-1 and it works
as expected using the instructions in README.md.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
2024-02-19 10:31:59 +02:00
NawafAlansari 4480542b22 baby-llama : allocate graphs in ggml_context (#5573)
* Fixed the baby-llama issue (see issue #4830)

* minor : fix whitespaces

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-19 10:25:38 +02:00
Xuan Son Nguyen 11b12de39b llama : add llama_chat_apply_template() (#5538)
* llama: add llama_chat_apply_template

* test-chat-template: remove dedundant vector

* chat_template: do not use std::string for buffer

* add clarification for llama_chat_apply_template

* llama_chat_apply_template: add zephyr template

* llama_chat_apply_template: correct docs

* llama_chat_apply_template: use term "chat" everywhere

* llama_chat_apply_template: change variable name to "tmpl"
2024-02-19 10:23:37 +02:00
slaren 3a9cb4ca64 cuda, metal : fix nans in soft_max (#5574)
* cuda : fix nans in soft_max

* metal : fix nans in soft_max

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-19 10:04:45 +02:00
Mirko185 769a716e30 readme : update (#5572)
Added 1.5-bit on README.md
2024-02-19 09:39:31 +02:00
bmwl f0d1fafc02 ggml : android and old glibc NUMA incompatibility bugfixes (#5557)
* #ifdef out some code NUMA blocks for Android due to lack of support

* added in some __ANDROID__ if def gates around numa code and forced GLIBC prior to 2.29 to use a syscall for getcpu instead of the wrapper

* Changed gates on numa platform specific stuff to __gnu_linux__ to skip any platforms without glibc

* harmonizing #if defined blocks for numa code to __gnu_linux__ since that's the only model that's being followed anyways

---------

Co-authored-by: root <root@nenya.lothlorien.ca>
2024-02-19 09:38:32 +02:00
Jared Van Bortel a0c2dad9d4 build : pass all warning flags to nvcc via -Xcompiler (#5570)
* build : pass all warning flags to nvcc via -Xcompiler
* make : fix apparent mis-merge from #3952
* make : fix incorrect GF_CC_VER for CUDA host compiler
2024-02-18 16:21:52 -05:00
Georgi Gerganov 14278f55d2 ggml : restore vec dot stride arg names (#5453) 2024-02-18 22:58:57 +02:00
Georgi Gerganov b1de96824b ci : fix wikitext url + compile warnings (#5569)
ggml-ci
2024-02-18 22:39:30 +02:00
Georgi Gerganov 7ad554f90e metal : fix unused warnings (#0) 2024-02-18 21:39:58 +02:00
Robey Holderith 5ee99c32f5 common, server : surface min_keep as its own parameter (#5567)
* Feature - surface min_keep as its own parameter

* Updated README with min_keep param
2024-02-18 21:11:16 +02:00
Pierrick Hymbert c145f8a132 server : slots monitoring endpoint (#5550) 2024-02-18 19:39:57 +02:00
Georgi Gerganov 689a091bbe sampling : do not set min_keep to n_probs (#5564) 2024-02-18 19:38:06 +02:00
Georgi Gerganov f3f28c5395 cmake : fix GGML_USE_SYCL typo (#5555) 2024-02-18 19:17:00 +02:00
Pierrick Hymbert e75c6279d1 server : enhanced health endpoint (#5548)
* server: enrich health endpoint with available slots, return 503 if not slots are available

* server: document new status no slot available in the README.md
2024-02-18 18:31:28 +02:00
Pierrick Hymbert 36376abe05 server : --n-predict option document and cap to max value (#5549)
* server: document --n-predict

* server: ensure client request cannot override n_predict if set

* server: fix print usage LF in new --n-predict option
2024-02-18 18:30:09 +02:00
Daniel Hiltgen 66c1968f7a server : graceful server shutdown (#5244)
This updates the server queue to support graceful shutdown of the server on signals.
2024-02-18 18:23:16 +02:00
Georgi Gerganov 1dcc3fde00 common : fix ub (#5530) 2024-02-18 18:21:52 +02:00
Herman Semenov 5d3de51f97 ggml, common, examples, tests : fixed type arguments in printf (#5528) 2024-02-18 18:20:12 +02:00
Daniel Bevenius fc0c8d286a llava : update surgery script to not remove tensors (#5536)
This commit updates the surgery script to not remove the tensors from the
model file. For this to work the `--skip-unknown` flag is added as an
argument to the convert.py script in README.md.

The motivation for this change is that the surgery script currently
removes the projector tensors from the model file. If the model was
checked out from a repository, the model file will have been updated
and have to be checked out again to reset this effect. If this can be
avoided I think it would be preferable.

I did not perform this change for BakLLaVA models as I am not sure
how that part works.
2024-02-18 18:19:23 +02:00
32 changed files with 566 additions and 183 deletions
+7 -13
View File
@@ -526,7 +526,7 @@ if (LLAMA_SYCL)
message(STATUS "SYCL found")
add_compile_definitions(GML_USE_SYCL)
add_compile_definitions(GGML_USE_SYCL)
if (LLAMA_SYCL_F16)
add_compile_definitions(GGML_SYCL_F16)
@@ -741,13 +741,6 @@ function(get_flags CCID CCVER)
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
list(APPEND CXX_FLAGS -Wextra-semi)
endif()
elseif (CCID MATCHES "Intel")
if (NOT LLAMA_SYCL)
# enable max optimization level when using Intel compiler
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
add_link_options(-fuse-ld=lld -static-intel)
endif()
endif()
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
@@ -778,10 +771,7 @@ endif()
set(CUDA_CXX_FLAGS "")
if (LLAMA_CUBLAS)
set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
if (NOT MSVC)
list(APPEND CUDA_FLAGS -Wno-pedantic)
endif()
set(CUDA_FLAGS -use_fast_math)
if (LLAMA_ALL_WARNINGS AND NOT MSVC)
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
@@ -814,7 +804,11 @@ if (LLAMA_CUBLAS)
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
get_flags(${CUDA_CCID} ${CUDA_CCVER})
list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
endif()
if (NOT MSVC)
list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
endif()
endif()
+10 -29
View File
@@ -97,9 +97,10 @@ endif
#
# keep standard at C11 and C++11
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_NVCCFLAGS = -std=c++11
# -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST
@@ -220,30 +221,6 @@ ifeq ($(LLAMA_FATAL_WARNINGS),1)
MK_CXXFLAGS += -Werror
endif
ifeq ($(CC_IS_CLANG), 1)
# clang options
MK_CFLAGS += -Wunreachable-code-break -Wunreachable-code-return
MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
MK_CFLAGS += -Wdouble-promotion
endif
ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
MK_CFLAGS += -Wdouble-promotion
endif
else
# gcc options
MK_CFLAGS += -Wdouble-promotion
MK_HOST_CXXFLAGS += -Wno-array-bounds
ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
MK_HOST_CXXFLAGS += -Wno-format-truncation
endif
ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
MK_HOST_CXXFLAGS += -Wextra-semi
endif
endif
# this version of Apple ld64 is buggy
ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -468,7 +445,7 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
ifdef JETSON_EOL_MODULE_DETECT
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
else
$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
$(NVCC) $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endif # JETSON_EOL_MODULE_DETECT
endif # LLAMA_CUBLAS
@@ -579,7 +556,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
ifdef LLAMA_CUBLAS
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
include scripts/get-flags.mk
CUDA_CXXFLAGS := $(GF_CXXFLAGS)
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
endif
#
@@ -891,3 +868,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+1 -1
View File
@@ -272,7 +272,7 @@ Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
Recommend to install to default folder: **/opt/intel/oneapi**.
Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.
Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.
+2 -2
View File
@@ -61,7 +61,7 @@ variety of hardware - locally and in the cloud.
- Plain C/C++ implementation without any dependencies
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2 and AVX512 support for x86 architectures
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
- Vulkan, SYCL, and (partial) OpenCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
@@ -768,7 +768,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread
#### How to run
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
+2 -2
View File
@@ -219,7 +219,7 @@ function gg_run_open_llama_3b_v2 {
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
@@ -401,7 +401,7 @@ function gg_run_open_llama_7b_v2 {
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/open-llama/7B-v2"
+5 -3
View File
@@ -1704,6 +1704,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
}
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
@@ -1741,7 +1742,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
@@ -1750,7 +1751,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
@@ -1801,7 +1802,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
if (cs_curr[j] < 0) { continue; }
if (seqs.find(cs_curr[j]) == seqs.end()) {
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
seqs[cs_curr[j]] = seqs.size();
const size_t sz = seqs.size();
seqs[cs_curr[j]] = sz;
}
}
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
+2 -2
View File
@@ -121,7 +121,7 @@ static void sampler_queue(
struct llama_context * ctx_main,
const llama_sampling_params & params,
llama_token_data_array & cur_p,
size_t & min_keep) {
size_t min_keep) {
const float temp = params.temp;
const float dynatemp_range = params.dynatemp_range;
const float dynatemp_exponent = params.dynatemp_exponent;
@@ -249,7 +249,7 @@ static llama_token llama_sampling_sample_impl(
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
} else {
// temperature sampling
size_t min_keep = std::max(1, params.n_probs);
size_t min_keep = std::max(1, params.min_keep);
sampler_queue(ctx_main, params, cur_p, min_keep);
+1
View File
@@ -22,6 +22,7 @@ enum class llama_sampler_type : char {
typedef struct llama_sampling_params {
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
+12 -10
View File
@@ -1533,16 +1533,17 @@ int main(int argc, char ** argv) {
int n_past = 0;
ggml_cgraph gf = {};
struct ggml_cgraph * gf = NULL;
gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);
struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch);
// struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
ggml_build_forward_expand(&gf, e);
ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
ggml_build_forward_expand(gf, e);
ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
float error_before_opt = ggml_get_f32_1d(e, 0);
@@ -1552,8 +1553,8 @@ int main(int argc, char ** argv) {
opt_params_lbfgs.lbfgs.n_iter = 16;
ggml_opt(ctx0, opt_params_lbfgs, e);
//
ggml_build_forward_expand(&gf, e);
ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
ggml_build_forward_expand(gf, e);
ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
float error_after_opt = ggml_get_f32_1d(e, 0);
@@ -1600,13 +1601,14 @@ int main(int argc, char ** argv) {
};
struct ggml_context * ctx0 = ggml_init(params);
ggml_cgraph gf = {};
struct ggml_cgraph * gf = NULL;
gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
int n_past = 0;
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
ggml_build_forward_expand(&gf, logits);
ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
ggml_build_forward_expand(gf, logits);
ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
+1 -1
View File
@@ -159,7 +159,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+1 -1
View File
@@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
const int n_ctx = llama_n_ctx(ctx);
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
@@ -325,14 +325,14 @@ struct train_params {
};
static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
printf("%s: n_embd: %d\n", __func__, params->n_embd);
printf("%s: n_mult: %d\n", __func__, params->n_mult);
printf("%s: n_head: %d\n", __func__, params->n_head);
printf("%s: n_ff: %d\n", __func__, params->n_ff);
printf("%s: n_layer: %d\n", __func__, params->n_layer);
printf("%s: n_rot: %d\n", __func__, params->n_rot);
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
printf("%s: n_embd: %u\n", __func__, params->n_embd);
printf("%s: n_mult: %u\n", __func__, params->n_mult);
printf("%s: n_head: %u\n", __func__, params->n_head);
printf("%s: n_ff: %u\n", __func__, params->n_ff);
printf("%s: n_layer: %u\n", __func__, params->n_layer);
printf("%s: n_rot: %u\n", __func__, params->n_rot);
}
static void init_model(struct my_llama_model * model) {
@@ -350,25 +350,25 @@ static void init_model(struct my_llama_model * model) {
model->train_tokens = 0;
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
// printing the per-layer allocations here so we dont print in the for loop.
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
ggml_set_name(model->norm, "norm.weight");
+1 -1
View File
@@ -53,7 +53,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa
5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./convert.py ../llava-v1.5-7b
python ./convert.py ../llava-v1.5-7b --skip-unknown
```
Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
-8
View File
@@ -19,19 +19,12 @@ mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_project
projector = {name: checkpoint[name].float() for name in mm_tensors}
torch.save(projector, f"{args.model}/llava.projector")
# remove these tensors from the checkpoint and save it again
for name in mm_tensors:
del checkpoint[name]
# BakLLaVA models contain CLIP tensors in it
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
if len(clip_tensors) > 0:
clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
torch.save(clip, f"{args.model}/llava.clip")
# remove these tensors
for name in clip_tensors:
del checkpoint[name]
# added tokens should be removed to be able to convert Mistral models
if os.path.exists(f"{args.model}/added_tokens.json"):
@@ -39,7 +32,6 @@ if len(clip_tensors) > 0:
f.write("{}\n")
torch.save(checkpoint, path)
print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
+3 -3
View File
@@ -309,7 +309,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
}
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
@@ -447,7 +447,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
return perplexity_v2(ctx, params);
}
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
@@ -1623,7 +1623,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
uint32_t n_ctx;
in.read((char *)&n_ctx, sizeof(n_ctx));
if (n_ctx > llama_n_ctx(ctx)) {
fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n",
fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
}
+68
View File
@@ -39,6 +39,8 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
## Build
@@ -135,6 +137,7 @@ node index.js
- `{"status": "loading model"}` if the model is still being loaded.
- `{"status": "error"}` if the model failed to load.
- `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
- `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
@@ -196,6 +199,8 @@ node index.js
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0)
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
@@ -379,6 +384,69 @@ Notice that each `probs` is an array of length `n_probs`.
}'
```
- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
### Result JSON
```json
[
{
"dynatemp_exponent": 1.0,
"dynatemp_range": 0.0,
"frequency_penalty": 0.0,
"grammar": "",
"id": 0,
"ignore_eos": false,
"logit_bias": [],
"min_p": 0.05000000074505806,
"mirostat": 0,
"mirostat_eta": 0.10000000149011612,
"mirostat_tau": 5.0,
"model": "llama-2-7b-32k-instruct.Q2_K.gguf",
"n_ctx": 2048,
"n_keep": 0,
"n_predict": 100000,
"n_probs": 0,
"next_token": {
"has_next_token": true,
"n_remain": -1,
"num_tokens_predicted": 0,
"stopped_eos": false,
"stopped_limit": false,
"stopped_word": false,
"stopping_word": ""
},
"penalize_nl": true,
"penalty_prompt_tokens": [],
"presence_penalty": 0.0,
"prompt": "Say hello to llama.cpp",
"repeat_last_n": 64,
"repeat_penalty": 1.100000023841858,
"samplers": [
"top_k",
"tfs_z",
"typical_p",
"top_p",
"min_p",
"temperature"
],
"seed": 42,
"state": 1,
"stop": [
"\n"
],
"stream": false,
"task_id": 0,
"temperature": 0.0,
"tfs_z": 1.0,
"top_k": 40,
"top_p": 0.949999988079071,
"typical_p": 1.0,
"use_penalty_prompt_tokens": false
}
]
```
## More examples
### Change system prompt on runtime
+4
View File
@@ -234,6 +234,7 @@
mirostat_eta: 0.1, // learning rate
grammar: '',
n_probs: 0, // no completion_probabilities,
min_keep: 0, // min probs from each sampler,
image_data: [],
cache_prompt: true,
api_key: ''
@@ -791,6 +792,9 @@
<fieldset>
${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
</fieldset>
<fieldset>
${IntField({ label: "Min Probabilities from each Sampler", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
</fieldset>
<fieldset>
<label for="api_key">API Key</label>
<input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
+99 -4
View File
@@ -28,6 +28,7 @@
#include <chrono>
#include <condition_variable>
#include <atomic>
#include <signal.h>
using json = nlohmann::json;
@@ -40,6 +41,7 @@ struct server_params
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
bool slots_endpoint = true;
};
bool server_verbose = false;
@@ -158,6 +160,7 @@ struct llama_client_slot
int32_t n_decoded = 0;
int32_t n_remaining = -1;
int32_t i_batch = -1;
int32_t n_predict = -1;
int32_t num_prompt_tokens = 0;
int32_t num_prompt_tokens_processed = 0;
@@ -409,6 +412,7 @@ struct llama_server_context
slot.id = i;
slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict;
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
@@ -544,6 +548,16 @@ struct llama_server_context
slot->params.seed = json_value(data, "seed", default_params.seed);
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
// Might be better to reject the request with a 400 ?
LOG_WARNING("Max tokens to predict exceeds server configuration", {
{"params.n_predict", slot->params.n_predict},
{"slot.n_predict", slot->n_predict},
});
slot->params.n_predict = slot->n_predict;
}
// infill
if (data.count("input_prefix") != 0)
@@ -1052,6 +1066,7 @@ struct llama_server_context
return json {
{"n_ctx", slot.n_ctx},
{"n_predict", slot.n_predict},
{"model", params.model_alias},
{"seed", slot.params.seed},
{"temperature", slot.sparams.temp},
@@ -1079,6 +1094,7 @@ struct llama_server_context
{"stream", slot.params.stream},
{"logit_bias", slot.sparams.logit_bias},
{"n_probs", slot.sparams.n_probs},
{"min_keep", slot.sparams.min_keep},
{"grammar", slot.sparams.grammar},
{"samplers", samplers_sequence}
};
@@ -1913,14 +1929,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n");
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
printf("\n");
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
printf(" --chat-template FORMAT_NAME");
printf(" set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
printf(" set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
printf("\n");
}
@@ -2360,6 +2378,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
log_set_target(stdout);
LOG_INFO("logging to file is disabled.", {});
}
else if (arg == "--slots-endpoint-disable")
{
sparams.slots_endpoint = false;
}
else if (arg == "--chat-template")
{
if (++i >= argc)
@@ -2511,6 +2533,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
}
}
std::function<void(int)> shutdown_handler;
inline void signal_handler(int signal) { shutdown_handler(signal); }
int main(int argc, char **argv)
{
#if SERVER_VERBOSE != 1
@@ -2561,8 +2586,35 @@ int main(int argc, char **argv)
server_state current_state = state.load();
switch(current_state) {
case SERVER_STATE_READY:
res.set_content(R"({"status": "ok"})", "application/json");
res.status = 200; // HTTP OK
if (llama.all_slots_are_idle) {
res.set_content(R"({"status": "ok"})", "application/json");
res.status = 200; // HTTP OK
} else {
int available_slots = 0;
int processing_slots = 0;
for (llama_client_slot & slot : llama.slots) {
if (slot.available()) {
available_slots++;
} else {
processing_slots++;
}
}
if (available_slots > 0) {
json health = {
{"status", "ok"},
{"slots_idle", available_slots},
{"slots_processing", processing_slots}};
res.set_content(health.dump(), "application/json");
res.status = 200; // HTTP OK
} else {
json health = {
{"status", "no slot available"},
{"slots_idle", available_slots},
{"slots_processing", processing_slots}};
res.set_content(health.dump(), "application/json");
res.status = 503; // HTTP Service Unavailable
}
}
break;
case SERVER_STATE_LOADING_MODEL:
res.set_content(R"({"status": "loading model"})", "application/json");
@@ -2575,6 +2627,32 @@ int main(int argc, char **argv)
}
});
if (sparams.slots_endpoint) {
svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
json slots;
for (llama_client_slot & slot : llama.slots) {
json slot_data = llama.get_formated_generation(slot);
slot_data["id"] = slot.id;
slot_data["task_id"] = slot.task_id;
slot_data["state"] = slot.state;
slot_data["prompt"] = slot.prompt;
slot_data["next_token"] = {
{"has_next_token", slot.has_next_token},
{"n_remain", slot.n_remaining},
{"num_tokens_predicted", slot.n_decoded},
{"stopped_eos", slot.stopped_eos},
{"stopped_word", slot.stopped_word},
{"stopped_limit", slot.stopped_limit},
{"stopping_word", slot.stopping_word},
};
slots.push_back(slot_data);
}
res.set_content(slots.dump(), "application/json");
res.status = 200; // HTTP OK
});
}
svr.set_logger(log_server_request);
svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
@@ -3128,8 +3206,25 @@ int main(int argc, char **argv)
std::placeholders::_2,
std::placeholders::_3
));
llama.queue_tasks.start_loop();
shutdown_handler = [&](int) {
llama.queue_tasks.terminate();
};
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = signal_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
llama.queue_tasks.start_loop();
svr.stop();
t.join();
llama_backend_free();
+17 -3
View File
@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
bool running;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
@@ -278,9 +279,18 @@ struct llama_server_queue {
queue_tasks_deferred.clear();
}
// Start the main loop. This call is blocking
[[noreturn]]
// end the start_loop routine
void terminate() {
{
std::unique_lock<std::mutex> lock(mutex_tasks);
running = false;
}
condition_tasks.notify_all();
}
// Start the main loop.
void start_loop() {
running = true;
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
@@ -324,8 +334,12 @@ struct llama_server_queue {
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
if (!running) {
LOG_VERBOSE("ending start_loop", {});
return;
}
condition_tasks.wait(lock, [&]{
return !queue_tasks.empty();
return (!queue_tasks.empty() || !running);
});
}
}
@@ -111,13 +111,13 @@ static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down";
static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
printf("%s: n_embd: %d\n", __func__, params->n_embd);
printf("%s: n_head: %d\n", __func__, params->n_head);
printf("%s: n_ff: %d\n", __func__, params->n_ff);
printf("%s: n_layer: %d\n", __func__, params->n_layer);
printf("%s: n_rot: %d\n", __func__, params->n_rot);
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
printf("%s: n_embd: %u\n", __func__, params->n_embd);
printf("%s: n_head: %u\n", __func__, params->n_head);
printf("%s: n_ff: %u\n", __func__, params->n_ff);
printf("%s: n_layer: %u\n", __func__, params->n_layer);
printf("%s: n_rot: %u\n", __func__, params->n_rot);
}
static void set_param_model(struct my_llama_model * model) {
+4 -4
View File
@@ -6205,7 +6205,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
const int ix = rowx*ncols + col;
const int iy = rowy*ncols + col;
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + slope*pos[col];
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
vals[col] = val;
max_val = max(max_val, val);
@@ -9170,17 +9170,17 @@ static void ggml_cuda_op_soft_max(
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
// positions tensor
float * src2_dd = dst_dd; // default to avoid null checks in the kernel
float * src2_dd = nullptr;
cuda_pool_alloc<float> src2_f;
ggml_tensor * src2 = dst->src[2];
const bool use_src2 = src2 != nullptr;
if (use_src2) {
const bool src2_on_device = use_src2 && src2->backend == GGML_BACKEND_GPU;
ggml_tensor_extra_gpu * src2_extra = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
if (src2_on_device) {
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
src2_dd = (float *) src2_extra->data_device[g_main_device];
} else {
src2_dd = src2_f.alloc(ggml_nelements(src2));
+22 -8
View File
@@ -392,7 +392,7 @@ kernel void kernel_soft_max(
float lmax = -INFINITY;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
}
// find the max value in the block
@@ -417,7 +417,7 @@ kernel void kernel_soft_max(
// parallel sum
float lsum = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
lsum += exp_psrc0;
pdst[i00] = exp_psrc0;
}
@@ -495,7 +495,7 @@ kernel void kernel_soft_max_4(
float4 lmax4 = -INFINITY;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
}
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@@ -521,7 +521,7 @@ kernel void kernel_soft_max_4(
// parallel sum
float4 lsum4 = 0.0f;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4;
}
@@ -4027,7 +4027,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
y4 += 32 * 32;
}
#else
// TODO
(void) x;
(void) y;
(void) yl;
(void) nb32;
#endif
for (int row = 0; row < N_DST; ++row) {
@@ -4170,7 +4173,10 @@ void kernel_mul_mv_iq2_xs_f32_impl(
y4 += 32 * 32;
}
#else
// TODO
(void) x;
(void) y;
(void) yl;
(void) nb32;
#endif
for (int row = 0; row < N_DST; ++row) {
@@ -4306,7 +4312,10 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
y4 += 32 * 32;
}
#else
// TODO
(void) x;
(void) y;
(void) yl;
(void) nb32;
#endif
for (int row = 0; row < N_DST; ++row) {
@@ -4424,7 +4433,10 @@ void kernel_mul_mv_iq1_s_f32_impl(
y4 += 16 * 32;
}
#else
// TODO
(void) x;
(void) y;
(void) yl;
(void) nb32;
#endif
for (int row = 0; row < N_DST; ++row) {
@@ -4659,6 +4671,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
const float dl = d * sc[0];
const float ml = min * sc[1];
#else
(void) get_scale_min_k4_just2;
q = q + 16 * (il&1);
device const uint8_t * s = xb->scales;
device const half2 * dh = (device const half2 *)xb->d;
+41 -41
View File
@@ -1837,9 +1837,9 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
float sigma2 = sumx2/QK_K;
for (int j = 0; j < QK_K/16; ++j) {
const float * restrict qw = quant_weights + QK_K * i + 16*j;
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
for (int l = 0; l < 16; ++l) sw[j] += weight[l];
scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
for (int l = 0; l < QK_K/16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
scales[j] = make_qkx3_quants(QK_K/16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
}
float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
@@ -3855,7 +3855,7 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -3866,8 +3866,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
assert(nrc == 1);
#endif
UNUSED(nrc);
UNUSED(bbx);
UNUSED(bby);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q4_0 * restrict x = vx;
@@ -4024,15 +4024,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);
__m128i bx = _mm_and_si128(lowMask, tmp);
__m128i by = _mm_loadu_si128((const __m128i *)y[i].qs);
bx = _mm_sub_epi8(bx, off);
const __m128i i32_0 = mul_sum_i8_pairs(bx, by);
__m128i bx_0 = _mm_and_si128(lowMask, tmp);
__m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
bx_0 = _mm_sub_epi8(bx_0, off);
const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
bx = _mm_sub_epi8(bx, off);
const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
bx_0 = _mm_sub_epi8(bx_0, off);
const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);
// Convert int32_t to float
__m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
@@ -4222,7 +4222,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
#endif
}
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_1;
const int nb = n / qk;
@@ -4233,8 +4233,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
assert(nrc == 1);
#endif
UNUSED(nrc);
UNUSED(bbx);
UNUSED(bby);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q4_1 * restrict x = vx;
@@ -4440,7 +4440,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
#endif
}
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -4448,8 +4448,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
assert(qk == QK5_0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bbx);
UNUSED(bby);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q5_0 * restrict x = vx;
@@ -4618,21 +4618,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
/* Compute combined scale for the block */
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
__m256i bx = bytes_from_nibbles_32(x[i].qs);
__m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
__m128i bxhil = _mm256_castsi256_si128(bxhi);
__m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
bxhil = _mm_andnot_si128(bxhil, mask);
bxhih = _mm_andnot_si128(bxhih, mask);
__m128i bxl = _mm256_castsi256_si128(bx);
__m128i bxh = _mm256_extractf128_si256(bx, 1);
__m128i bxl = _mm256_castsi256_si128(bx_0);
__m128i bxh = _mm256_extractf128_si256(bx_0, 1);
bxl = _mm_or_si128(bxl, bxhil);
bxh = _mm_or_si128(bxh, bxhih);
bx = MM256_SET_M128I(bxh, bxl);
bx_0 = MM256_SET_M128I(bxh, bxl);
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_i8_pairs_float(bx, by);
const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);
/* Multiply q with scale and accumulate */
acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
@@ -4731,7 +4731,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_1;
const int nb = n / qk;
@@ -4739,8 +4739,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
assert(qk == QK5_1);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bbx);
UNUSED(bby);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q5_1 * restrict x = vx;
@@ -4925,22 +4925,22 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
__m256i bx = bytes_from_nibbles_32(x[i].qs);
__m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
__m128i bxhil = _mm256_castsi256_si128(bxhi);
__m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
bxhil = _mm_and_si128(bxhil, mask);
bxhih = _mm_and_si128(bxhih, mask);
__m128i bxl = _mm256_castsi256_si128(bx);
__m128i bxh = _mm256_extractf128_si256(bx, 1);
__m128i bxl = _mm256_castsi256_si128(bx_0);
__m128i bxh = _mm256_extractf128_si256(bx_0, 1);
bxl = _mm_or_si128(bxl, bxhil);
bxh = _mm_or_si128(bxh, bxhih);
bx = MM256_SET_M128I(bxh, bxl);
bx_0 = MM256_SET_M128I(bxh, bxl);
const __m256 dy = _mm256_set1_ps(y[i].d);
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_us8_pairs_float(bx, by);
const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
}
@@ -5035,7 +5035,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -5046,8 +5046,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
assert(nrc == 1);
#endif
UNUSED(nrc);
UNUSED(bbx);
UNUSED(bby);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q8_0 * restrict x = vx;
@@ -5169,10 +5169,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
for (int i = 0; i < nb; i++) {
// load elements
vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[i].qs, vl);
vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
+16 -7
View File
@@ -23,6 +23,9 @@
#include <limits.h>
#include <stdarg.h>
#include <signal.h>
#if defined(__gnu_linux__)
#include <syscall.h>
#endif
#ifdef GGML_USE_METAL
#include <unistd.h>
@@ -1971,7 +1974,7 @@ struct ggml_numa_nodes {
uint32_t n_nodes;
uint32_t total_cpus; // hardware threads on system
uint32_t current_node; // node on which main process is execting
#ifdef __linux__
#if defined(__gnu_linux__)
cpu_set_t cpuset; // cpuset from numactl
#else
uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
@@ -2009,7 +2012,7 @@ inline static void ggml_critical_section_end(void) {
atomic_fetch_sub(&g_state_barrier, 1);
}
#ifdef __linux__
#if defined(__gnu_linux__)
static cpu_set_t ggml_get_numa_affinity(void) {
cpu_set_t cpuset;
pthread_t thread;
@@ -2031,7 +2034,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
return;
}
#ifdef __linux__
#if defined(__gnu_linux__)
struct stat st;
char path[256];
int rv;
@@ -2063,7 +2066,13 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
// figure out which node we're on
uint current_cpu;
int getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
int getcpu_ret = 0;
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28)
getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
#else
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
getcpu_ret = syscall(SYS_getcpu,&current_cpu,&g_state.numa.current_node);
#endif
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
g_state.numa.n_nodes = 0;
@@ -16734,7 +16743,7 @@ typedef pthread_t ggml_thread_t;
#endif
// Android's libc implementation "bionic" does not support setting affinity
#if defined(__linux__) && !defined(__BIONIC__)
#if defined(__gnu_linux__)
static void set_numa_thread_affinity(int thread_n) {
if (!ggml_is_numa()) {
return;
@@ -17909,7 +17918,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
ptr += ggml_nbytes(tensor);
fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
}
}
@@ -18012,7 +18021,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
result->nodes[i] = tensor;
fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
}
}
}
+117
View File
@@ -12508,6 +12508,123 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
return 0;
}
// trim whitespace from the beginning and end of a string
static std::string trim(const std::string & str) {
size_t start = 0;
size_t end = str.size();
while (start < end && isspace(str[start])) {
start += 1;
}
while (end > start && isspace(str[end - 1])) {
end -= 1;
}
return str.substr(start, end - start);
}
// Simple version of "llama_apply_chat_template" that only works with strings
// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
static int32_t llama_chat_apply_template_internal(
const std::string & tmpl,
const std::vector<const llama_chat_message *> & chat,
std::string & dest, bool add_ass) {
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
std::stringstream ss;
if (tmpl.find("<|im_start|>") != std::string::npos) {
// chatml template
for (auto message : chat) {
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
}
if (add_ass) {
ss << "<|im_start|>assistant\n";
}
} else if (tmpl.find("[INST]") != std::string::npos) {
// llama2 template and its variants
// [variant] support system message
bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
// [variant] space before + after response
bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
// [variant] add BOS inside history
bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
// [variant] trim spaces from the input message
bool strip_message = tmpl.find("content.strip()") != std::string::npos;
// construct the prompt
bool is_inside_turn = true; // skip BOS at the beginning
ss << "[INST] ";
for (auto message : chat) {
std::string content = strip_message ? trim(message->content) : message->content;
std::string role(message->role);
if (!is_inside_turn) {
is_inside_turn = true;
ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
}
if (role == "system") {
if (support_system_message) {
ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
} else {
// if the model does not support system message, we still include it in the first message, but without <<SYS>>
ss << content << "\n";
}
} else if (role == "user") {
ss << content << " [/INST]";
} else {
ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
is_inside_turn = false;
}
}
// llama2 templates seem to not care about "add_generation_prompt"
} else if (tmpl.find("<|user|>") != std::string::npos) {
// zephyr template
for (auto message : chat) {
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
}
if (add_ass) {
ss << "<|assistant|>\n";
}
} else {
// template not supported
return -1;
}
dest = ss.str();
return dest.size();
}
LLAMA_API int32_t llama_chat_apply_template(
const struct llama_model * model,
const char * tmpl,
const struct llama_chat_message * chat,
size_t n_msg,
bool add_ass,
char * buf,
int32_t length) {
std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
if (tmpl == nullptr) {
GGML_ASSERT(model != nullptr);
// load template from model
std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
std::string template_key = "tokenizer.chat_template";
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), curr_tmpl.size());
if (res < 0) {
// worst case: there is no information about template, we will use chatml by default
curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
} else {
curr_tmpl = std::string(model_template.data(), model_template.size());
}
}
// format the chat to string
std::vector<const llama_chat_message *> chat_vec;
chat_vec.resize(n_msg);
for (size_t i = 0; i < n_msg; i++) {
chat_vec[i] = &chat[i];
}
std::string formatted_chat;
int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
if (res < 0) {
return res;
}
strncpy(buf, formatted_chat.c_str(), length);
return res;
}
struct llama_timings llama_get_timings(struct llama_context * ctx) {
struct llama_timings result = {
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
+25
View File
@@ -305,6 +305,12 @@ extern "C" {
int32_t n_eval;
};
// used in chat template
typedef struct llama_chat_message {
const char * role;
const char * content;
} llama_chat_message;
// Helpers for getting default parameters
LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -699,6 +705,25 @@ extern "C" {
char * buf,
int32_t length);
/// Apply chat template. Inspired by hf apply_chat_template() on python.
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
/// NOTE: This function only support some known jinja templates. It is not a jinja parser.
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the models default chat template will be used instead.
/// @param chat Pointer to a list of multiple llama_chat_message
/// @param n_msg Number of llama_chat_message in this chat
/// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
/// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
/// @param length The size of the allocated buffer
/// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
LLAMA_API int32_t llama_chat_apply_template(
const struct llama_model * model,
const char * tmpl,
const struct llama_chat_message * chat,
size_t n_msg,
bool add_ass,
char * buf,
int32_t length);
//
// Grammar
//
+1 -1
View File
@@ -1,6 +1,6 @@
ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
GF_CC_IS_GCC = 1
GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null || $(GF_CC) -dumpversion; } | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null; echo; $(GF_CC) -dumpversion; } | awk -F. '/./ { printf("%02d%02d%02d", $$1, $$2, $$3); exit }')
else
GF_CC_IS_CLANG = 1
ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'
+1 -1
View File
@@ -1,6 +1,6 @@
#!/bin/bash
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
echo "Usage:"
echo ""
+1
View File
@@ -28,6 +28,7 @@ endfunction()
llama_build_and_test_executable(test-quantize-fns.cpp)
llama_build_and_test_executable(test-quantize-perf.cpp)
llama_build_and_test_executable(test-sampling.cpp)
llama_build_and_test_executable(test-chat-template.cpp)
llama_build_executable(test-tokenizer-0-llama.cpp)
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+64
View File
@@ -0,0 +1,64 @@
#include <iostream>
#include <string>
#include <vector>
#include <sstream>
#undef NDEBUG
#include <cassert>
#include "llama.h"
int main(void) {
llama_chat_message conversation[] = {
{"system", "You are a helpful assistant"},
{"user", "Hello"},
{"assistant", "Hi there"},
{"user", "Who are you"},
{"assistant", " I am an assistant "},
{"user", "Another question"},
};
size_t message_count = 6;
std::vector<std::string> templates = {
// teknium/OpenHermes-2.5-Mistral-7B
"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
// mistralai/Mistral-7B-Instruct-v0.2
"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
// TheBloke/FusionNet_34Bx2_MoE-AWQ
"{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
// bofenghuang/vigogne-2-70b-chat
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
};
std::vector<std::string> expected_substr = {
"<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant",
"[/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
"</s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
"[/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
};
std::vector<char> formatted_chat(1024);
int32_t res;
// test invalid chat template
res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size());
assert(res < 0);
for (size_t i = 0; i < templates.size(); i++) {
std::string custom_template = templates[i];
std::string substr = expected_substr[i];
formatted_chat.resize(1024);
res = llama_chat_apply_template(
nullptr,
custom_template.c_str(),
conversation,
message_count,
true,
formatted_chat.data(),
formatted_chat.size()
);
formatted_chat.resize(res);
std::string output(formatted_chat.data(), formatted_chat.size());
std::cout << output << "\n-------------------------\n";
// expect the "formatted_chat" to contain pre-defined strings
assert(output.find(substr) != std::string::npos);
}
return 0;
}
+10 -10
View File
@@ -38,8 +38,8 @@ term ::= [0-9]+)""";
// pretty print error message before asserting
if (expected_pair.first != key || expected_pair.second != value)
{
fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second);
fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value);
fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second);
fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value);
fprintf(stderr, "expected_pair != actual_pair\n");
}
@@ -96,9 +96,9 @@ term ::= [0-9]+)""";
// pretty print error message before asserting
if (expected_element.type != element.type || expected_element.value != element.value)
{
fprintf(stderr, "index: %d\n", index);
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value);
fprintf(stderr, "index: %u\n", index);
fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %u\n", element.type, element.value);
fprintf(stderr, "expected_element != actual_element\n");
}
@@ -144,8 +144,8 @@ term ::= [0-9]+)""";
// pretty print error message before asserting
if (expected_pair.first != key || expected_pair.second != value)
{
fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second);
fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value);
fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second);
fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value);
fprintf(stderr, "expected_pair != actual_pair\n");
}
@@ -235,9 +235,9 @@ term ::= [0-9]+)""";
// pretty print error message before asserting
if (expected_element.type != element.type || expected_element.value != element.value)
{
fprintf(stderr, "index: %d\n", index);
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value);
fprintf(stderr, "index: %u\n", index);
fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %u\n", element.type, element.value);
fprintf(stderr, "expected_element != actual_element\n");
}
+2 -2
View File
@@ -180,8 +180,8 @@ int main()
if (expected_element.type != element->type || expected_element.value != element->value)
{
fprintf(stderr, "index: %d\n", index);
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %d\n", element->type, element->value);
fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %u\n", element->type, element->value);
fprintf(stderr, "expected_element != actual_element\n");
}