mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-16 10:46:43 +02:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c4e6dd59e4 | |||
| 037259be68 | |||
| 263978904c | |||
| cf45252a7c | |||
| 03bf161eb6 | |||
| ad014bba97 | |||
| 49cc1f7d67 |
@@ -569,6 +569,14 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
|
||||
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
||||
ifdef LLAMA_CUBLAS
|
||||
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
||||
CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
||||
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
||||
ifndef CUDA_DOCKER_ARCH
|
||||
ifndef CUDA_POWER_ARCH
|
||||
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
|
||||
endif # CUDA_POWER_ARCH
|
||||
endif # CUDA_DOCKER_ARCH
|
||||
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
|
||||
endif # LLAMA_CUBLAS
|
||||
$(info )
|
||||
|
||||
|
||||
@@ -568,6 +568,50 @@ function gg_sum_open_llama_7b_v2 {
|
||||
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
|
||||
}
|
||||
|
||||
# bge-small
|
||||
|
||||
function gg_run_embd_bge_small {
|
||||
cd ${SRC}
|
||||
|
||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
|
||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
|
||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
|
||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
|
||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
|
||||
|
||||
path_models="../models-mnt/bge-small"
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert-hf-to-gguf.py ${path_models}
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
|
||||
(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_embd_bge_small {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'BGE Small (BERT):\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
}
|
||||
|
||||
## main
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
@@ -591,6 +635,8 @@ test $ret -eq 0 && gg_run ctest_debug
|
||||
test $ret -eq 0 && gg_run ctest_release
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
test $ret -eq 0 && gg_run embd_bge_small
|
||||
|
||||
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
||||
if [ -z ${GG_BUILD_CUDA} ]; then
|
||||
test $ret -eq 0 && gg_run open_llama_3b_v2
|
||||
|
||||
@@ -1648,6 +1648,7 @@ class BertModel(Model):
|
||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
||||
self.gguf_writer.add_causal_attention(False)
|
||||
self.gguf_writer.add_pooling_layer(True)
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def set_vocab(self):
|
||||
|
||||
@@ -7,6 +7,51 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static std::vector<std::string> split_lines(const std::string & s) {
|
||||
std::string line;
|
||||
std::vector<std::string> lines;
|
||||
std::stringstream ss(s);
|
||||
while (std::getline(ss, line)) {
|
||||
lines.push_back(line);
|
||||
}
|
||||
return lines;
|
||||
}
|
||||
|
||||
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
|
||||
for (size_t i = 0; i < tokens.size(); i++) {
|
||||
llama_batch_add(batch, tokens[i], i, { seq_id }, false);
|
||||
}
|
||||
}
|
||||
|
||||
static void normalize(float * vec, float * out, int n) {
|
||||
float norm = 0;
|
||||
for (int i = 0; i < n; i++) {
|
||||
norm += vec[i] * vec[i];
|
||||
}
|
||||
norm = sqrt(norm);
|
||||
for (int i = 0; i < n; i++) {
|
||||
out[i] = vec[i] / norm;
|
||||
}
|
||||
}
|
||||
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
// run model
|
||||
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
if (llama_decode(ctx, batch) < 0) {
|
||||
fprintf(stderr, "%s : failed to decode\n", __func__);
|
||||
}
|
||||
|
||||
// normalize on copy
|
||||
for (int k = 0; k < n_seq; k++) {
|
||||
float * emb = llama_get_embeddings_ith(ctx, k);
|
||||
float * out = output + k * n_embd;
|
||||
normalize(emb, out, n_embd);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
@@ -55,59 +100,84 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
int n_past = 0;
|
||||
// split the prompt into lines
|
||||
std::vector<std::string> prompts = split_lines(params.prompt);
|
||||
|
||||
// tokenize the prompt
|
||||
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
// max batch size
|
||||
const uint64_t n_batch = params.n_batch;
|
||||
GGML_ASSERT(params.n_batch == params.n_ctx);
|
||||
|
||||
// tokenize the prompts and trim
|
||||
std::vector<std::vector<int32_t>> inputs;
|
||||
for (const auto & prompt : prompts) {
|
||||
auto inp = ::llama_tokenize(ctx, prompt, true);
|
||||
if (inp.size() > n_batch) {
|
||||
inp.resize(n_batch);
|
||||
}
|
||||
inputs.push_back(inp);
|
||||
}
|
||||
|
||||
// tokenization stats
|
||||
if (params.verbose_prompt) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
for (int i = 0; i < (int) inputs.size(); i++) {
|
||||
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
|
||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
|
||||
for (int j = 0; j < (int) inputs[i].size(); j++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
|
||||
}
|
||||
fprintf(stderr, "\n\n");
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
if (embd_inp.size() > (size_t)n_ctx) {
|
||||
fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
|
||||
__func__, embd_inp.size(), n_ctx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
while (!embd_inp.empty()) {
|
||||
int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
|
||||
if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
n_past += n_tokens;
|
||||
embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
|
||||
}
|
||||
// initialize batch
|
||||
const int n_prompts = prompts.size();
|
||||
struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts);
|
||||
|
||||
// allocate output
|
||||
const int n_embd = llama_n_embd(model);
|
||||
auto * embeddings = llama_get_embeddings(ctx);
|
||||
std::vector<float> embeddings(n_prompts * n_embd, 0);
|
||||
float * emb = embeddings.data();
|
||||
|
||||
// l2-normalize embeddings
|
||||
float norm = 0;
|
||||
for (int i = 0; i < n_embd; i++) {
|
||||
norm += embeddings[i] * embeddings[i];
|
||||
}
|
||||
norm = sqrt(norm);
|
||||
for (int i = 0; i < n_embd; i++) {
|
||||
embeddings[i] /= norm;
|
||||
// break into batches
|
||||
int p = 0; // number of prompts processed already
|
||||
int s = 0; // number of prompts in current batch
|
||||
for (int k = 0; k < n_prompts; k++) {
|
||||
// clamp to n_batch tokens
|
||||
auto & inp = inputs[k];
|
||||
const uint64_t n_toks = inp.size();
|
||||
|
||||
// encode if at capacity
|
||||
if (batch.n_tokens + n_toks > n_batch) {
|
||||
float * out = emb + p * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd);
|
||||
llama_batch_clear(batch);
|
||||
p += s;
|
||||
s = 0;
|
||||
}
|
||||
|
||||
// add to batch
|
||||
batch_add_seq(batch, inp, s);
|
||||
s += 1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_embd; i++) {
|
||||
printf("%f ", embeddings[i]);
|
||||
}
|
||||
printf("\n");
|
||||
// final batch
|
||||
float * out = emb + p * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd);
|
||||
|
||||
// print first 3 embeddings
|
||||
for (int j = 0; j < std::min(3, n_prompts); j++) {
|
||||
fprintf(stderr, "embedding %d: ", j);
|
||||
for (int i = 0; i < n_embd; i++) {
|
||||
fprintf(stderr, "%f ", emb[j * n_embd + i]);
|
||||
}
|
||||
fprintf(stderr, "\n\n");
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
// clean up
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -80,9 +80,9 @@ The LORA rank can be configured for each model tensor type separately with these
|
||||
--rank-wk N LORA rank for wk tensor (default 4)
|
||||
--rank-wv N LORA rank for wv tensor (default 4)
|
||||
--rank-wo N LORA rank for wo tensor (default 4)
|
||||
--rank-w1 N LORA rank for w1 tensor (default 4)
|
||||
--rank-w2 N LORA rank for w2 tensor (default 4)
|
||||
--rank-w3 N LORA rank for w3 tensor (default 4)
|
||||
--rank-ffn_gate N LORA rank for ffn_gate tensor (default 4)
|
||||
--rank-ffn_down N LORA rank for ffn_down tensor (default 4)
|
||||
--rank-ffn_up N LORA rank for ffn_up tensor (default 4)
|
||||
```
|
||||
|
||||
The LORA rank of 'norm' tensors should always be 1.
|
||||
|
||||
+121
-121
@@ -60,9 +60,9 @@ struct my_llama_layer {
|
||||
struct ggml_tensor * ffn_norm;
|
||||
|
||||
// ff
|
||||
struct ggml_tensor * w1;
|
||||
struct ggml_tensor * w2;
|
||||
struct ggml_tensor * w3;
|
||||
struct ggml_tensor * ffn_gate; // w1
|
||||
struct ggml_tensor * ffn_down; // w2
|
||||
struct ggml_tensor * ffn_up; // w3
|
||||
};
|
||||
|
||||
struct my_llama_model {
|
||||
@@ -85,9 +85,9 @@ struct my_llama_lora_hparams {
|
||||
uint32_t n_rank_wv = 4;
|
||||
uint32_t n_rank_wo = 4;
|
||||
uint32_t n_rank_ffn_norm = 1;
|
||||
uint32_t n_rank_w1 = 4;
|
||||
uint32_t n_rank_w2 = 4;
|
||||
uint32_t n_rank_w3 = 4;
|
||||
uint32_t n_rank_ffn_gate = 4;
|
||||
uint32_t n_rank_ffn_down = 4;
|
||||
uint32_t n_rank_ffn_up = 4;
|
||||
uint32_t n_rank_tok_embeddings = 4;
|
||||
uint32_t n_rank_norm = 1;
|
||||
uint32_t n_rank_output = 4;
|
||||
@@ -117,12 +117,12 @@ struct my_llama_lora_layer {
|
||||
struct ggml_tensor * ffn_norm_b;
|
||||
|
||||
// ff
|
||||
struct ggml_tensor * w1_a;
|
||||
struct ggml_tensor * w1_b;
|
||||
struct ggml_tensor * w2_a;
|
||||
struct ggml_tensor * w2_b;
|
||||
struct ggml_tensor * w3_a;
|
||||
struct ggml_tensor * w3_b;
|
||||
struct ggml_tensor * ffn_gate_a;
|
||||
struct ggml_tensor * ffn_gate_b;
|
||||
struct ggml_tensor * ffn_down_a;
|
||||
struct ggml_tensor * ffn_down_b;
|
||||
struct ggml_tensor * ffn_up_a;
|
||||
struct ggml_tensor * ffn_up_b;
|
||||
};
|
||||
|
||||
struct my_llama_lora {
|
||||
@@ -208,9 +208,9 @@ static void print_lora_params(struct my_llama_lora_hparams * params) {
|
||||
printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv);
|
||||
printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo);
|
||||
printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm);
|
||||
printf("%s: n_rank_w1 : %u\n", __func__, params->n_rank_w1);
|
||||
printf("%s: n_rank_w2 : %u\n", __func__, params->n_rank_w2);
|
||||
printf("%s: n_rank_w3 : %u\n", __func__, params->n_rank_w3);
|
||||
printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate);
|
||||
printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down);
|
||||
printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up);
|
||||
printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings);
|
||||
printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm);
|
||||
printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output);
|
||||
@@ -319,9 +319,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
|
||||
layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i));
|
||||
layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i));
|
||||
layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i));
|
||||
layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
|
||||
layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
|
||||
layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
|
||||
layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
|
||||
layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
|
||||
layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
|
||||
|
||||
assert_shape_1d(layer.attention_norm, hparams.n_embd);
|
||||
assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
|
||||
@@ -329,9 +329,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
|
||||
assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa());
|
||||
assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
|
||||
assert_shape_1d(layer.ffn_norm, hparams.n_embd);
|
||||
assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff);
|
||||
assert_shape_2d(layer.w2, hparams.n_ff, hparams.n_embd);
|
||||
assert_shape_2d(layer.w3, hparams.n_embd, hparams.n_ff);
|
||||
assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff);
|
||||
assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd);
|
||||
assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -362,12 +362,12 @@ static void set_param_lora(struct my_llama_lora * lora) {
|
||||
ggml_set_param(ctx, layer.wo_b);
|
||||
ggml_set_param(ctx, layer.ffn_norm_a);
|
||||
ggml_set_param(ctx, layer.ffn_norm_b);
|
||||
ggml_set_param(ctx, layer.w1_a);
|
||||
ggml_set_param(ctx, layer.w1_b);
|
||||
ggml_set_param(ctx, layer.w2_a);
|
||||
ggml_set_param(ctx, layer.w2_b);
|
||||
ggml_set_param(ctx, layer.w3_a);
|
||||
ggml_set_param(ctx, layer.w3_b);
|
||||
ggml_set_param(ctx, layer.ffn_gate_a);
|
||||
ggml_set_param(ctx, layer.ffn_gate_b);
|
||||
ggml_set_param(ctx, layer.ffn_down_a);
|
||||
ggml_set_param(ctx, layer.ffn_down_b);
|
||||
ggml_set_param(ctx, layer.ffn_up_a);
|
||||
ggml_set_param(ctx, layer.ffn_up_b);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -435,12 +435,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
|
||||
layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd);
|
||||
layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1);
|
||||
|
||||
layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd);
|
||||
layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff);
|
||||
layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff);
|
||||
layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd);
|
||||
layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd);
|
||||
layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff);
|
||||
layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd);
|
||||
layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff);
|
||||
layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff);
|
||||
layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd);
|
||||
layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd);
|
||||
layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff);
|
||||
|
||||
ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i));
|
||||
ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i));
|
||||
@@ -454,12 +454,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
|
||||
ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i));
|
||||
ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i));
|
||||
ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i));
|
||||
ggml_set_name(layer.w1_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i));
|
||||
ggml_set_name(layer.w1_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i));
|
||||
ggml_set_name(layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i));
|
||||
ggml_set_name(layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i));
|
||||
ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i));
|
||||
ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i));
|
||||
ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i));
|
||||
ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i));
|
||||
ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i));
|
||||
ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i));
|
||||
ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i));
|
||||
ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i));
|
||||
}
|
||||
|
||||
set_param_lora(lora);
|
||||
@@ -497,12 +497,12 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
|
||||
randomize_tensor_normal(layer.ffn_norm_a, rnd);
|
||||
ggml_set_zero(layer.ffn_norm_b);
|
||||
|
||||
randomize_tensor_normal(layer.w1_a, rnd);
|
||||
ggml_set_zero(layer.w1_b);
|
||||
randomize_tensor_normal(layer.w2_a, rnd);
|
||||
ggml_set_zero(layer.w2_b);
|
||||
randomize_tensor_normal(layer.w3_a, rnd);
|
||||
ggml_set_zero(layer.w3_b);
|
||||
randomize_tensor_normal(layer.ffn_gate_a, rnd);
|
||||
ggml_set_zero(layer.ffn_gate_b);
|
||||
randomize_tensor_normal(layer.ffn_down_a, rnd);
|
||||
ggml_set_zero(layer.ffn_down_b);
|
||||
randomize_tensor_normal(layer.ffn_up_a, rnd);
|
||||
ggml_set_zero(layer.ffn_up_b);
|
||||
}
|
||||
|
||||
free_random_normal_distribution(rnd);
|
||||
@@ -610,13 +610,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||
|
||||
struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b));
|
||||
struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b));
|
||||
struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
|
||||
struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
|
||||
struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
|
||||
struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
|
||||
struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b));
|
||||
struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b));
|
||||
struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b));
|
||||
struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
|
||||
struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
|
||||
struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
|
||||
struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
|
||||
struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b));
|
||||
struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b));
|
||||
struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b));
|
||||
|
||||
struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch);
|
||||
@@ -659,11 +659,11 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||
struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
|
||||
struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
|
||||
struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
|
||||
struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
|
||||
struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
|
||||
struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
|
||||
struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
|
||||
cur = t30;
|
||||
if (enable_checkpointing) {
|
||||
@@ -723,9 +723,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
|
||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
|
||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
|
||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f));
|
||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f));
|
||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f));
|
||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f));
|
||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f));
|
||||
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f));
|
||||
}
|
||||
|
||||
// allocating checkpoints in one block to reduce memory fragmentation
|
||||
@@ -798,9 +798,9 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
|
||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V);
|
||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT);
|
||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM);
|
||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
|
||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
|
||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
|
||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
|
||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
|
||||
GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
|
||||
|
||||
init_lora(model, lora);
|
||||
|
||||
@@ -825,12 +825,12 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
|
||||
copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b));
|
||||
copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
|
||||
copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
|
||||
copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a));
|
||||
copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b));
|
||||
copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a));
|
||||
copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b));
|
||||
copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a));
|
||||
copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b));
|
||||
copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a));
|
||||
copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b));
|
||||
copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a));
|
||||
copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b));
|
||||
copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a));
|
||||
copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -868,9 +868,9 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
|
||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv);
|
||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo);
|
||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm);
|
||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_w1);
|
||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_w2);
|
||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_w3);
|
||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate);
|
||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down);
|
||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up);
|
||||
|
||||
gguf_add_tensor(fctx, lora->tok_embeddings_a);
|
||||
gguf_add_tensor(fctx, lora->tok_embeddings_b);
|
||||
@@ -894,12 +894,12 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
|
||||
gguf_add_tensor(fctx, layer.wo_b);
|
||||
gguf_add_tensor(fctx, layer.ffn_norm_a);
|
||||
gguf_add_tensor(fctx, layer.ffn_norm_b);
|
||||
gguf_add_tensor(fctx, layer.w1_a);
|
||||
gguf_add_tensor(fctx, layer.w1_b);
|
||||
gguf_add_tensor(fctx, layer.w2_a);
|
||||
gguf_add_tensor(fctx, layer.w2_b);
|
||||
gguf_add_tensor(fctx, layer.w3_a);
|
||||
gguf_add_tensor(fctx, layer.w3_b);
|
||||
gguf_add_tensor(fctx, layer.ffn_gate_a);
|
||||
gguf_add_tensor(fctx, layer.ffn_gate_b);
|
||||
gguf_add_tensor(fctx, layer.ffn_down_a);
|
||||
gguf_add_tensor(fctx, layer.ffn_down_b);
|
||||
gguf_add_tensor(fctx, layer.ffn_up_a);
|
||||
gguf_add_tensor(fctx, layer.ffn_up_b);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1104,12 +1104,12 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
|
||||
write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB"));
|
||||
write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA"));
|
||||
write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB"));
|
||||
write_tensor(&file, layer.w1_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA"));
|
||||
write_tensor(&file, layer.w1_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB"));
|
||||
write_tensor(&file, layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA"));
|
||||
write_tensor(&file, layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB"));
|
||||
write_tensor(&file, layer.w3_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA"));
|
||||
write_tensor(&file, layer.w3_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB"));
|
||||
write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA"));
|
||||
write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB"));
|
||||
write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA"));
|
||||
write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB"));
|
||||
write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA"));
|
||||
write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1139,9 +1139,9 @@ struct train_params {
|
||||
uint32_t n_rank_wv;
|
||||
uint32_t n_rank_wo;
|
||||
uint32_t n_rank_ffn_norm;
|
||||
uint32_t n_rank_w1;
|
||||
uint32_t n_rank_w2;
|
||||
uint32_t n_rank_w3;
|
||||
uint32_t n_rank_ffn_gate;
|
||||
uint32_t n_rank_ffn_down;
|
||||
uint32_t n_rank_ffn_up;
|
||||
uint32_t n_rank_tok_embeddings;
|
||||
uint32_t n_rank_norm;
|
||||
uint32_t n_rank_output;
|
||||
@@ -1152,9 +1152,9 @@ struct train_params {
|
||||
bool custom_n_rank_wv;
|
||||
bool custom_n_rank_wo;
|
||||
bool custom_n_rank_ffn_norm;
|
||||
bool custom_n_rank_w1;
|
||||
bool custom_n_rank_w2;
|
||||
bool custom_n_rank_w3;
|
||||
bool custom_n_rank_ffn_gate;
|
||||
bool custom_n_rank_ffn_down;
|
||||
bool custom_n_rank_ffn_up;
|
||||
bool custom_n_rank_tok_embeddings;
|
||||
bool custom_n_rank_norm;
|
||||
bool custom_n_rank_output;
|
||||
@@ -1186,9 +1186,9 @@ static struct train_params get_default_train_params() {
|
||||
params.n_rank_wv = 4;
|
||||
params.n_rank_wo = 4;
|
||||
params.n_rank_ffn_norm = 1;
|
||||
params.n_rank_w1 = 4;
|
||||
params.n_rank_w2 = 4;
|
||||
params.n_rank_w3 = 4;
|
||||
params.n_rank_ffn_gate = 4;
|
||||
params.n_rank_ffn_down = 4;
|
||||
params.n_rank_ffn_up = 4;
|
||||
params.n_rank_tok_embeddings = 4;
|
||||
params.n_rank_norm = 1;
|
||||
params.n_rank_output = 4;
|
||||
@@ -1199,9 +1199,9 @@ static struct train_params get_default_train_params() {
|
||||
params.custom_n_rank_wv = false;
|
||||
params.custom_n_rank_wo = false;
|
||||
params.custom_n_rank_ffn_norm = false;
|
||||
params.custom_n_rank_w1 = false;
|
||||
params.custom_n_rank_w2 = false;
|
||||
params.custom_n_rank_w3 = false;
|
||||
params.custom_n_rank_ffn_gate = false;
|
||||
params.custom_n_rank_ffn_down = false;
|
||||
params.custom_n_rank_ffn_up = false;
|
||||
params.custom_n_rank_tok_embeddings = false;
|
||||
params.custom_n_rank_norm = false;
|
||||
params.custom_n_rank_output = false;
|
||||
@@ -1232,9 +1232,9 @@ static void train_print_usage(int argc, char ** argv, const struct train_params
|
||||
fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n");
|
||||
fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n");
|
||||
fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n");
|
||||
fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n");
|
||||
fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n");
|
||||
fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n");
|
||||
fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n");
|
||||
fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n");
|
||||
fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n");
|
||||
|
||||
print_common_train_usage(argc, argv, ¶ms->common);
|
||||
}
|
||||
@@ -1369,27 +1369,27 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
|
||||
}
|
||||
params->n_rank_wo = std::stoi(argv[i]);
|
||||
params->custom_n_rank_wo = true;
|
||||
} else if (arg == "--rank-w1") {
|
||||
} else if (arg == "--rank-ffn_gate") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params->n_rank_w1 = std::stoi(argv[i]);
|
||||
params->custom_n_rank_w1 = true;
|
||||
} else if (arg == "--rank-w2") {
|
||||
params->n_rank_ffn_gate = std::stoi(argv[i]);
|
||||
params->custom_n_rank_ffn_gate = true;
|
||||
} else if (arg == "--rank-ffn_down") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params->n_rank_w2 = std::stoi(argv[i]);
|
||||
params->custom_n_rank_w2 = true;
|
||||
} else if (arg == "--rank-w3") {
|
||||
params->n_rank_ffn_down = std::stoi(argv[i]);
|
||||
params->custom_n_rank_ffn_down = true;
|
||||
} else if (arg == "--rank-ffn_up") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params->n_rank_w3 = std::stoi(argv[i]);
|
||||
params->custom_n_rank_w3 = true;
|
||||
params->n_rank_ffn_up = std::stoi(argv[i]);
|
||||
params->custom_n_rank_ffn_up = true;
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
train_print_usage(argc, argv, &default_params);
|
||||
@@ -1452,12 +1452,12 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) {
|
||||
nx += ggml_nelements(layer.wo_b);
|
||||
nx += ggml_nelements(layer.ffn_norm_a);
|
||||
nx += ggml_nelements(layer.ffn_norm_b);
|
||||
nx += ggml_nelements(layer.w1_a);
|
||||
nx += ggml_nelements(layer.w1_b);
|
||||
nx += ggml_nelements(layer.w2_a);
|
||||
nx += ggml_nelements(layer.w2_b);
|
||||
nx += ggml_nelements(layer.w3_a);
|
||||
nx += ggml_nelements(layer.w3_b);
|
||||
nx += ggml_nelements(layer.ffn_gate_a);
|
||||
nx += ggml_nelements(layer.ffn_gate_b);
|
||||
nx += ggml_nelements(layer.ffn_down_a);
|
||||
nx += ggml_nelements(layer.ffn_down_b);
|
||||
nx += ggml_nelements(layer.ffn_up_a);
|
||||
nx += ggml_nelements(layer.ffn_up_b);
|
||||
}
|
||||
return nx;
|
||||
}
|
||||
@@ -1511,9 +1511,9 @@ int main(int argc, char ** argv) {
|
||||
uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r;
|
||||
uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r;
|
||||
uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1;
|
||||
uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r;
|
||||
uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r;
|
||||
uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r;
|
||||
uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r;
|
||||
uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r;
|
||||
uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r;
|
||||
uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r;
|
||||
uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1;
|
||||
uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r;
|
||||
@@ -1523,9 +1523,9 @@ int main(int argc, char ** argv) {
|
||||
lora.hparams.n_rank_wv = n_rank_wv;
|
||||
lora.hparams.n_rank_wo = n_rank_wo;
|
||||
lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm;
|
||||
lora.hparams.n_rank_w1 = n_rank_w1;
|
||||
lora.hparams.n_rank_w2 = n_rank_w2;
|
||||
lora.hparams.n_rank_w3 = n_rank_w3;
|
||||
lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate;
|
||||
lora.hparams.n_rank_ffn_down = n_rank_ffn_down;
|
||||
lora.hparams.n_rank_ffn_up = n_rank_ffn_up;
|
||||
lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings;
|
||||
lora.hparams.n_rank_norm = n_rank_norm;
|
||||
lora.hparams.n_rank_output = n_rank_output;
|
||||
@@ -1566,9 +1566,9 @@ int main(int argc, char ** argv) {
|
||||
|| (lora.hparams.n_rank_wv != n_rank_wv)
|
||||
|| (lora.hparams.n_rank_wo != n_rank_wo)
|
||||
|| (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm)
|
||||
|| (lora.hparams.n_rank_w1 != n_rank_w1)
|
||||
|| (lora.hparams.n_rank_w2 != n_rank_w2)
|
||||
|| (lora.hparams.n_rank_w3 != n_rank_w3)
|
||||
|| (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate)
|
||||
|| (lora.hparams.n_rank_ffn_down != n_rank_ffn_down)
|
||||
|| (lora.hparams.n_rank_ffn_up != n_rank_ffn_up)
|
||||
|| (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings)
|
||||
|| (lora.hparams.n_rank_norm != n_rank_norm)
|
||||
|| (lora.hparams.n_rank_output != n_rank_output)
|
||||
|
||||
@@ -50,9 +50,9 @@ struct my_llama_layer {
|
||||
struct ggml_tensor * ffn_norm;
|
||||
|
||||
// ff
|
||||
struct ggml_tensor * w1;
|
||||
struct ggml_tensor * w2;
|
||||
struct ggml_tensor * w3;
|
||||
struct ggml_tensor * ffn_gate; // w1
|
||||
struct ggml_tensor * ffn_down; // w2
|
||||
struct ggml_tensor * ffn_up; // w3
|
||||
};
|
||||
|
||||
struct my_llama_model {
|
||||
@@ -140,9 +140,9 @@ static void set_param_model(struct my_llama_model * model) {
|
||||
ggml_set_param(ctx, layer.wv);
|
||||
ggml_set_param(ctx, layer.wo);
|
||||
ggml_set_param(ctx, layer.ffn_norm);
|
||||
ggml_set_param(ctx, layer.w1);
|
||||
ggml_set_param(ctx, layer.w2);
|
||||
ggml_set_param(ctx, layer.w3);
|
||||
ggml_set_param(ctx, layer.ffn_gate);
|
||||
ggml_set_param(ctx, layer.ffn_down);
|
||||
ggml_set_param(ctx, layer.ffn_up);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,9 +198,9 @@ static void init_model(struct my_llama_model * model) {
|
||||
|
||||
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
||||
layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
|
||||
layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
||||
layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
||||
layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
|
||||
layer.ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
||||
|
||||
ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i));
|
||||
|
||||
@@ -211,9 +211,9 @@ static void init_model(struct my_llama_model * model) {
|
||||
|
||||
ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i));
|
||||
|
||||
ggml_set_name(layer.w1, tni(LLM_TENSOR_FFN_GATE, i));
|
||||
ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i));
|
||||
ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i));
|
||||
ggml_set_name(layer.ffn_gate, tni(LLM_TENSOR_FFN_GATE, i));
|
||||
ggml_set_name(layer.ffn_down, tni(LLM_TENSOR_FFN_DOWN, i));
|
||||
ggml_set_name(layer.ffn_up, tni(LLM_TENSOR_FFN_UP, i));
|
||||
}
|
||||
|
||||
set_param_model(model);
|
||||
@@ -244,9 +244,9 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean,
|
||||
|
||||
randomize_tensor_normal(layer.ffn_norm, rnd);
|
||||
|
||||
randomize_tensor_normal(layer.w1, rnd);
|
||||
randomize_tensor_normal(layer.w2, rnd);
|
||||
randomize_tensor_normal(layer.w3, rnd);
|
||||
randomize_tensor_normal(layer.ffn_gate, rnd);
|
||||
randomize_tensor_normal(layer.ffn_down, rnd);
|
||||
randomize_tensor_normal(layer.ffn_up, rnd);
|
||||
}
|
||||
|
||||
free_random_normal_distribution(rnd);
|
||||
@@ -356,11 +356,11 @@ static struct ggml_tensor * llama_build_train_graphs(
|
||||
struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
|
||||
struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
|
||||
struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
|
||||
struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
|
||||
struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
|
||||
struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
|
||||
struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
|
||||
cur = t30;
|
||||
checkpoints.push_back(cur);
|
||||
@@ -521,9 +521,9 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
|
||||
copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
|
||||
copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
|
||||
copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
|
||||
copy_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
|
||||
copy_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
|
||||
copy_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
|
||||
copy_tensor_by_name(layer.ffn_gate, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
|
||||
copy_tensor_by_name(layer.ffn_down, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
|
||||
copy_tensor_by_name(layer.ffn_up, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -664,9 +664,9 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
|
||||
gguf_add_tensor(fctx, layer.wv);
|
||||
gguf_add_tensor(fctx, layer.wo);
|
||||
gguf_add_tensor(fctx, layer.ffn_norm);
|
||||
gguf_add_tensor(fctx, layer.w1);
|
||||
gguf_add_tensor(fctx, layer.w2);
|
||||
gguf_add_tensor(fctx, layer.w3);
|
||||
gguf_add_tensor(fctx, layer.ffn_gate);
|
||||
gguf_add_tensor(fctx, layer.ffn_down);
|
||||
gguf_add_tensor(fctx, layer.ffn_up);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -915,9 +915,9 @@ static int64_t get_parameter_count(struct my_llama_model* model) {
|
||||
nx += ggml_nelements(layer.wv);
|
||||
nx += ggml_nelements(layer.wo);
|
||||
nx += ggml_nelements(layer.ffn_norm);
|
||||
nx += ggml_nelements(layer.w1);
|
||||
nx += ggml_nelements(layer.w2);
|
||||
nx += ggml_nelements(layer.w3);
|
||||
nx += ggml_nelements(layer.ffn_gate);
|
||||
nx += ggml_nelements(layer.ffn_down);
|
||||
nx += ggml_nelements(layer.ffn_up);
|
||||
}
|
||||
return nx;
|
||||
}
|
||||
|
||||
@@ -40,6 +40,7 @@ class Keys:
|
||||
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||
EXPERT_COUNT = "{arch}.expert_count"
|
||||
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
||||
POOLING_LAYER = "{arch}.pooling_layer"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
|
||||
@@ -360,6 +360,9 @@ class GGUFWriter:
|
||||
def add_causal_attention(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
|
||||
|
||||
def add_pooling_layer(self, value: bool) -> None:
|
||||
self.add_bool(Keys.LLM.POOLING_LAYER.format(arch=self.arch), value)
|
||||
|
||||
def add_rope_dimension_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
|
||||
|
||||
|
||||
@@ -254,6 +254,7 @@ enum llm_kv {
|
||||
LLM_KV_TENSOR_DATA_LAYOUT,
|
||||
LLM_KV_EXPERT_COUNT,
|
||||
LLM_KV_EXPERT_USED_COUNT,
|
||||
LLM_KV_POOLING_LAYER,
|
||||
|
||||
LLM_KV_ATTENTION_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||
@@ -311,6 +312,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
||||
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
||||
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
||||
{ LLM_KV_POOLING_LAYER, "%s.pooling_layer" },
|
||||
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||
@@ -1539,6 +1541,7 @@ struct llama_hparams {
|
||||
float f_max_alibi_bias;
|
||||
|
||||
bool causal_attn = true;
|
||||
bool pooling_layer = false;
|
||||
|
||||
|
||||
bool operator!=(const llama_hparams & other) const {
|
||||
@@ -1601,6 +1604,7 @@ struct llama_cparams {
|
||||
|
||||
bool mul_mat_q;
|
||||
bool offload_kqv;
|
||||
bool do_pooling;
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
void * cb_eval_user_data;
|
||||
@@ -1896,7 +1900,7 @@ struct llama_context {
|
||||
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
||||
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
||||
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
||||
struct ggml_tensor * inp_sum; // F32 [1, n_batch]
|
||||
struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch]
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_context * ctx_mpi = NULL;
|
||||
@@ -3053,6 +3057,7 @@ static void llm_load_hparams(
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
||||
ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 3:
|
||||
@@ -3309,7 +3314,12 @@ static void llm_load_vocab(
|
||||
|
||||
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
||||
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
||||
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
||||
try {
|
||||
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
||||
} catch (const std::exception & e) {
|
||||
LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
|
||||
vocab.linefeed_id = vocab.special_pad_id;
|
||||
}
|
||||
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
||||
vocab.linefeed_id = vocab.special_pad_id;
|
||||
} else {
|
||||
@@ -4379,9 +4389,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
|
||||
model.hparams.vocab_only = params.vocab_only;
|
||||
|
||||
llm_load_arch (ml, model);
|
||||
llm_load_hparams(ml, model);
|
||||
llm_load_vocab (ml, model);
|
||||
try {
|
||||
llm_load_arch(ml, model);
|
||||
} catch(const std::exception & e) {
|
||||
throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
|
||||
}
|
||||
try {
|
||||
llm_load_hparams(ml, model);
|
||||
} catch(const std::exception & e) {
|
||||
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
|
||||
}
|
||||
try {
|
||||
llm_load_vocab(ml, model);
|
||||
} catch(const std::exception & e) {
|
||||
throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
|
||||
}
|
||||
|
||||
llm_load_print_meta(ml, model);
|
||||
|
||||
@@ -4859,7 +4881,7 @@ struct llm_build_context {
|
||||
const int32_t n_orig_ctx;
|
||||
|
||||
const bool do_rope_shift;
|
||||
const bool causal_attn;
|
||||
const bool do_pooling;
|
||||
|
||||
const llm_build_cb & cb;
|
||||
|
||||
@@ -4903,7 +4925,7 @@ struct llm_build_context {
|
||||
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
||||
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
||||
do_rope_shift (worst_case || kv_self.has_shift),
|
||||
causal_attn (hparams.causal_attn),
|
||||
do_pooling (hparams.pooling_layer && cparams.do_pooling),
|
||||
cb (cb),
|
||||
buf_compute_meta (lctx.buf_compute_meta) {
|
||||
// all initializations should be done in init()
|
||||
@@ -5752,17 +5774,18 @@ struct llm_build_context {
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
// get input vectors with right size
|
||||
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
struct ggml_tensor * inp_sum = ggml_view_1d(ctx0, lctx.inp_sum, n_tokens, 0);
|
||||
struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
|
||||
|
||||
// construct input embeddings (token, type, position)
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
|
||||
// token types are hardcoded to zero ("Sentence A")
|
||||
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
|
||||
inpL = ggml_add(ctx0, inpL, type_row0);
|
||||
@@ -5832,9 +5855,11 @@ struct llm_build_context {
|
||||
// final output
|
||||
cur = inpL;
|
||||
|
||||
// pooling
|
||||
cur = ggml_mul_mat(ctx0, inp_sum, ggml_cont(ctx0, ggml_transpose(ctx0, cur)));
|
||||
cb(cur, "result_embed", -1);
|
||||
// pooling layer
|
||||
if (do_pooling) {
|
||||
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
|
||||
}
|
||||
cb(cur, "result_embd", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
@@ -7367,7 +7392,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
float f;
|
||||
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
||||
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
|
||||
(hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
|
||||
f = -INFINITY;
|
||||
} else {
|
||||
f = 0;
|
||||
@@ -7378,7 +7404,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
|
||||
float * data = (float *) lctx.inp_sum->data;
|
||||
@@ -7399,6 +7424,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||
data[i] = lctx.kv_self.cells[i].delta;
|
||||
}
|
||||
}
|
||||
|
||||
if (hparams.pooling_layer && cparams.do_pooling) {
|
||||
const int64_t n_tokens = batch.n_tokens;
|
||||
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
|
||||
float * data = (float *) lctx.inp_sum->data;
|
||||
|
||||
memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
|
||||
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
const llama_seq_id seq_id = batch.seq_id[i][0];
|
||||
data[seq_id*n_tokens + i] = 1.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// decode a batch of tokens by evaluating the transformer
|
||||
@@ -7510,7 +7549,7 @@ static int llama_decode_internal(
|
||||
embeddings = gf->nodes[gf->n_nodes - 3];
|
||||
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
||||
}
|
||||
} else if (strcmp(res->name, "result_embed") == 0) {
|
||||
} else if (strcmp(res->name, "result_embd") == 0) {
|
||||
embeddings = res;
|
||||
res = nullptr;
|
||||
} else {
|
||||
@@ -7630,11 +7669,12 @@ static int llama_decode_internal(
|
||||
if (!lctx.embedding.empty()) {
|
||||
auto & embedding_out = lctx.embedding;
|
||||
|
||||
const int64_t embed_pos = res ? n_embd * (n_tokens-1) : 0;
|
||||
const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
|
||||
const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
|
||||
|
||||
embedding_out.resize(n_embd);
|
||||
embedding_out.resize(embd_size);
|
||||
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
|
||||
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embed_pos*sizeof(float), n_embd*sizeof(float));
|
||||
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
|
||||
ggml_backend_synchronize(embeddings_backend);
|
||||
}
|
||||
|
||||
@@ -7711,7 +7751,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
||||
switch (llama_vocab_get_type(vocab)) {
|
||||
case LLAMA_VOCAB_TYPE_SPM: {
|
||||
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
|
||||
return vocab.token_to_id.at(buf);
|
||||
auto token = vocab.token_to_id.find(buf);
|
||||
if (token != vocab.token_to_id.end()) {
|
||||
return (*token).second;
|
||||
}
|
||||
// Try to fall back to just the byte as a string
|
||||
const char buf2[2] = { (char)ch, 0 };
|
||||
return vocab.token_to_id.at(buf2);
|
||||
}
|
||||
case LLAMA_VOCAB_TYPE_WPM:
|
||||
case LLAMA_VOCAB_TYPE_BPE: {
|
||||
@@ -7759,7 +7805,7 @@ struct llm_bigram_spm {
|
||||
};
|
||||
|
||||
struct llm_tokenizer_spm {
|
||||
llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
|
||||
llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
|
||||
|
||||
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
||||
// split string into utf8 chars
|
||||
@@ -7834,6 +7880,7 @@ private:
|
||||
|
||||
if (p == rev_merge.end()) {
|
||||
// output any symbols that did not form tokens as bytes.
|
||||
output.reserve(output.size() + symbol.n);
|
||||
for (int j = 0; j < (int)symbol.n; ++j) {
|
||||
llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
|
||||
output.push_back(token_id);
|
||||
@@ -8396,17 +8443,18 @@ struct fragment_buffer_variant {
|
||||
token(_token),
|
||||
raw_text(_dummy),
|
||||
offset(0),
|
||||
length(0){}
|
||||
length(0) {}
|
||||
|
||||
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
|
||||
:
|
||||
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
|
||||
token((llama_vocab::id)-1),
|
||||
token((llama_vocab::id) - 1),
|
||||
raw_text(_raw_text),
|
||||
offset(_offset),
|
||||
length(_length){
|
||||
GGML_ASSERT( _offset >= 0 );
|
||||
GGML_ASSERT( _length >= 1 );
|
||||
GGML_ASSERT( offset + length <= raw_text.length() );
|
||||
GGML_ASSERT(_offset >= 0);
|
||||
GGML_ASSERT(_length >= 1);
|
||||
GGML_ASSERT(offset + length <= raw_text.length());
|
||||
}
|
||||
|
||||
const FRAGMENT_BUFFER_VARIANT_TYPE type;
|
||||
@@ -8530,14 +8578,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
}
|
||||
|
||||
std::forward_list<fragment_buffer_variant> fragment_buffer;
|
||||
fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
|
||||
fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
|
||||
|
||||
if (special) tokenizer_st_partition( vocab, fragment_buffer );
|
||||
if (special) tokenizer_st_partition(vocab, fragment_buffer);
|
||||
|
||||
switch (vocab.type) {
|
||||
case LLAMA_VOCAB_TYPE_SPM:
|
||||
{
|
||||
for (const auto & fragment: fragment_buffer) {
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
||||
|
||||
@@ -8565,7 +8613,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_BPE:
|
||||
{
|
||||
for (const auto & fragment: fragment_buffer) {
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
|
||||
@@ -8581,7 +8629,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_WPM:
|
||||
{
|
||||
for (const auto & fragment: fragment_buffer) {
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
|
||||
@@ -10444,7 +10492,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
quantize &= !params->only_copy;
|
||||
|
||||
// do not quantize expert gating tensors
|
||||
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
|
||||
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
|
||||
|
||||
// do not quantize positional embeddings and token types (BERT)
|
||||
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
|
||||
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
|
||||
|
||||
enum ggml_type new_type;
|
||||
void * new_data;
|
||||
@@ -10946,6 +10998,7 @@ struct llama_context_params llama_context_default_params() {
|
||||
/*.logits_all =*/ false,
|
||||
/*.embedding =*/ false,
|
||||
/*.offload_kqv =*/ true,
|
||||
/*.do_pooling =*/ true,
|
||||
};
|
||||
|
||||
return result;
|
||||
@@ -11101,6 +11154,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
||||
cparams.mul_mat_q = params.mul_mat_q;
|
||||
cparams.offload_kqv = params.offload_kqv;
|
||||
cparams.do_pooling = params.do_pooling;
|
||||
|
||||
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
||||
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
|
||||
@@ -11248,7 +11302,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
// resized during inference, reserve maximum
|
||||
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
|
||||
|
||||
if (params.embedding){
|
||||
if (params.embedding) {
|
||||
ctx->embedding.resize(hparams.n_embd);
|
||||
}
|
||||
|
||||
@@ -11266,7 +11320,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
||||
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
|
||||
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
|
||||
ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, 1, cparams.n_batch);
|
||||
ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
|
||||
|
||||
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
||||
ggml_set_name(ctx->inp_embd, "inp_embd");
|
||||
@@ -12124,6 +12178,10 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
||||
return ctx->embedding.data();
|
||||
}
|
||||
|
||||
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
||||
return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
|
||||
}
|
||||
|
||||
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
|
||||
return model->vocab.id_to_token[token].text.c_str();
|
||||
}
|
||||
|
||||
@@ -236,6 +236,7 @@ extern "C" {
|
||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||
bool embedding; // embedding mode only
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
|
||||
};
|
||||
|
||||
// model quantization parameters
|
||||
@@ -628,6 +629,10 @@ extern "C" {
|
||||
// shape: [n_embd] (1-dimensional)
|
||||
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
|
||||
// Get the embeddings for the ith sequence
|
||||
// llama_get_embeddings(ctx) + i*n_embd
|
||||
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
||||
|
||||
//
|
||||
// Vocab
|
||||
//
|
||||
|
||||
@@ -4,13 +4,13 @@
|
||||
#include "console.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <codecvt>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <codecvt>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <locale>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
@@ -74,45 +74,46 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
}
|
||||
catch (const std::invalid_argument &) {
|
||||
fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
|
||||
//fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
|
||||
// NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters
|
||||
if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) {
|
||||
std::string str = " " + codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||
if (str != check) {
|
||||
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Restrict to assigned unicode planes
|
||||
// for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
||||
for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||
if (str != check) {
|
||||
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||
if (str != check) {
|
||||
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
return 4;
|
||||
// unicode
|
||||
{
|
||||
const int nthread = std::thread::hardware_concurrency();
|
||||
|
||||
std::vector<std::thread> threads(nthread);
|
||||
|
||||
for (int i = 0; i < nthread; ++i) {
|
||||
threads[i] = std::thread([i, nthread, ctx]() {
|
||||
for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
|
||||
if (!( // NOLINT
|
||||
(cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 &&
|
||||
(cp < 0x13 || cp > 0x17) && cp != 0x19 &&
|
||||
(cp < 0x1c || cp > 0x1e) &&
|
||||
(cp < 0xd800 || cp > 0xdfff) &&
|
||||
(cp < 0x00040000 || cp >= 0x000e0000)
|
||||
)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||
if (cp != 9601 && str != check) {
|
||||
fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
std::exit(3);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
for (auto & t : threads) {
|
||||
t.join();
|
||||
}
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
|
||||
|
||||
@@ -4,13 +4,13 @@
|
||||
#include "console.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <codecvt>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <codecvt>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <locale>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
@@ -72,26 +72,33 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
|
||||
if (cp < 0xd800 || cp > 0xdfff) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||
if (cp != 9601 && str != check) {
|
||||
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
return 3;
|
||||
}
|
||||
// unicode
|
||||
{
|
||||
const int nthread = std::thread::hardware_concurrency();
|
||||
|
||||
std::vector<std::thread> threads(nthread);
|
||||
|
||||
for (int i = 0; i < nthread; ++i) {
|
||||
threads[i] = std::thread([i, nthread, ctx]() {
|
||||
for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
|
||||
if (cp >= 0xd800 && cp <= 0xdfff) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||
if (cp != 9601 && str != check) {
|
||||
fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
std::exit(3);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||
if (str != check) {
|
||||
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
return 4;
|
||||
|
||||
for (auto & t : threads) {
|
||||
t.join();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -264,26 +264,29 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
offset += 1;
|
||||
return result;
|
||||
}
|
||||
else if (!(utf8[offset + 0] & 0x40)) {
|
||||
if (!(utf8[offset + 0] & 0x40)) {
|
||||
throw std::invalid_argument("invalid character");
|
||||
}
|
||||
else if (!(utf8[offset + 0] & 0x20)) {
|
||||
if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80))
|
||||
if (!(utf8[offset + 0] & 0x20)) {
|
||||
if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
|
||||
throw std::invalid_argument("invalid character");
|
||||
}
|
||||
auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
|
||||
offset += 2;
|
||||
return result;
|
||||
}
|
||||
else if (!(utf8[offset + 0] & 0x10)) {
|
||||
if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80))
|
||||
if (!(utf8[offset + 0] & 0x10)) {
|
||||
if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
|
||||
throw std::invalid_argument("invalid character");
|
||||
}
|
||||
auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
|
||||
offset += 3;
|
||||
return result;
|
||||
}
|
||||
else if (!(utf8[offset + 0] & 0x08)) {
|
||||
if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80))
|
||||
if (!(utf8[offset + 0] & 0x08)) {
|
||||
if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
|
||||
throw std::invalid_argument("invalid character");
|
||||
}
|
||||
auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
|
||||
offset += 4;
|
||||
return result;
|
||||
@@ -331,21 +334,22 @@ static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t
|
||||
offset += 1;
|
||||
return result;
|
||||
}
|
||||
else {
|
||||
if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00))
|
||||
throw std::invalid_argument("invalid character");
|
||||
auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
|
||||
offset += 2;
|
||||
return result;
|
||||
|
||||
if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
|
||||
throw std::invalid_argument("invalid character");
|
||||
}
|
||||
throw std::invalid_argument("invalid string");
|
||||
|
||||
auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
|
||||
offset += 2;
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
|
||||
std::vector<uint32_t> result;
|
||||
size_t offset = 0;
|
||||
while (offset < utf16.size())
|
||||
while (offset < utf16.size()) {
|
||||
result.push_back(codepoint_from_utf16(utf16, offset));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -361,44 +365,52 @@ static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> &
|
||||
static std::unordered_map<uint32_t, int> codepoint_type_map() {
|
||||
std::unordered_map<uint32_t, int> codepoint_types;
|
||||
for (auto p : digit_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
for (auto i = p.first; i <= p.second; ++ i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
|
||||
}
|
||||
}
|
||||
for(auto p : letter_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
for (auto p : letter_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++ i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_LETTER;
|
||||
}
|
||||
}
|
||||
for(auto p : whitespace_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
for (auto p : whitespace_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++ i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
|
||||
}
|
||||
}
|
||||
for(auto p : accent_mark_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
for (auto p : accent_mark_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++ i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
|
||||
}
|
||||
}
|
||||
for(auto p : punctuation_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
for (auto p : punctuation_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++ i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
|
||||
}
|
||||
}
|
||||
for (auto p : symbol_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++i)
|
||||
for (auto p : symbol_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
|
||||
}
|
||||
}
|
||||
for(auto p : control_ranges) {
|
||||
for(auto i = p.first; i <= p.second; ++ i)
|
||||
for (auto p : control_ranges) {
|
||||
for (auto i = p.first; i <= p.second; ++ i) {
|
||||
codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
|
||||
}
|
||||
}
|
||||
return codepoint_types;
|
||||
}
|
||||
|
||||
static int codepoint_type(uint32_t cp) {
|
||||
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
|
||||
return codepoint_types[cp];
|
||||
return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
|
||||
}
|
||||
|
||||
static int codepoint_type(const std::string & utf8) {
|
||||
if (utf8.length() == 0)
|
||||
if (utf8.length() == 0) {
|
||||
return CODEPOINT_TYPE_UNIDENTIFIED;
|
||||
}
|
||||
size_t offset = 0;
|
||||
return codepoint_type(codepoint_from_utf8(utf8, offset));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user