swift : fix concatenation method to avoid invalid UTF8 stringfication (#4325)

swift : fix prompt tokenization logic (#4321)
grammar-parser : fix typo (#4318)
2023-12-04 18:03:49 +02:00 · 2023-12-04 15:43:45 +02:00 · 2023-12-04 09:57:35 +02:00 · 2023-12-03 15:56:35 +02:00 · 2023-12-03 15:56:22 +02:00 · 2023-12-03 11:10:43 +02:00
8 changed files with 57 additions and 65 deletions
@@ -190,7 +190,7 @@ namespace grammar_parser {
                pos = parse_space(pos + 1, is_nested);
            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
                if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
+                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
                }

                // apply transformation to previous symbol (last_sym_start to end) according to
@@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
 llama_print_timings(context)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-    let n_tokens = text.count + (add_bos ? 1 : 0)
+    let utf8Count = text.utf8.count
+    let n_tokens = utf8Count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
@@ -11,6 +11,8 @@ actor LlamaContext {
    private var context: OpaquePointer
    private var batch: llama_batch
    private var tokens_list: [llama_token]
+    /// This variable is used to store temporarily invalid cchars
+    private var temporary_invalid_cchars: [CChar]

    var n_len: Int32 = 512
    var n_cur: Int32 = 0
@@ -21,6 +23,7 @@ actor LlamaContext {
        self.context = context
        self.tokens_list = []
        self.batch = llama_batch_init(512, 0, 1)
+        self.temporary_invalid_cchars = []
    }

    deinit {
@@ -61,6 +64,7 @@ actor LlamaContext {
        print("attempting to complete \"\(text)\"")

        tokens_list = tokenize(text: text, add_bos: true)
+        temporary_invalid_cchars = []

        let n_ctx = llama_n_ctx(context)
        let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
@@ -72,7 +76,7 @@ actor LlamaContext {
        }

        for id in tokens_list {
-            print(token_to_piece(token: id))
+            print(String(cString: token_to_piece(token: id) + [0]))
        }

        // batch = llama_batch_init(512, 0) // done in init()
@@ -115,10 +119,25 @@ actor LlamaContext {

        if new_token_id == llama_token_eos(context) || n_cur == n_len {
            print("\n")
-            return ""
+            let new_token_str = String(cString: temporary_invalid_cchars + [0])
+            temporary_invalid_cchars.removeAll()
+            return new_token_str
        }

-        let new_token_str = token_to_piece(token: new_token_id)
+        let new_token_cchars = token_to_piece(token: new_token_id)
+        temporary_invalid_cchars.append(contentsOf: new_token_cchars)
+        let new_token_str: String
+        if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
+            temporary_invalid_cchars.removeAll()
+            new_token_str = string
+        } else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
+            // in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
+            let string = String(cString: temporary_invalid_cchars + [0])
+            temporary_invalid_cchars.removeAll()
+            new_token_str = string
+        } else {
+            new_token_str = ""
+        }
        print(new_token_str)
        // tokens_list.append(new_token_id)

@@ -144,12 +163,14 @@ actor LlamaContext {

    func clear() {
        tokens_list.removeAll()
+        temporary_invalid_cchars.removeAll()
    }

    private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-        let n_tokens = text.count + (add_bos ? 1 : 0)
+        let utf8Count = text.utf8.count
+        let n_tokens = utf8Count + (add_bos ? 1 : 0)
        let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-        let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false)
+        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)

        var swiftTokens: [llama_token] = []
        for i in 0..<tokenCount {
@@ -161,7 +182,8 @@ actor LlamaContext {
        return swiftTokens
    }

-    private func token_to_piece(token: llama_token) -> String {
+    /// - note: The result does not contain null-terminator
+    private func token_to_piece(token: llama_token) -> [CChar] {
        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
        result.initialize(repeating: Int8(0), count: 8)
        defer {
@@ -175,10 +197,12 @@ actor LlamaContext {
            defer {
                newResult.deallocate()
            }
-            _ = llama_token_to_piece(model, token, newResult, -nTokens)
-            return String(cString: newResult)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
+            return Array(bufferPointer)
        } else {
-            return String(cString: result)
+            let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
+            return Array(bufferPointer)
        }
    }
 }
@@ -70,6 +70,7 @@ def make_postData(body, chat=False, stream=False):
    if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"]
    if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"]
    if(is_present(body, "seed")): postData["seed"] = body["seed"]
+    if(is_present(body, "grammar")): postData["grammar"] = body["grammar"]
    if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()]
    if (args.stop != ""):
        postData["stop"] = [args.stop]
@@ -1469,7 +1469,7 @@ struct llama_server_context

    int split_multiprompt_task(task_server& multiprompt_task)
    {
-        auto prompt_count = multiprompt_task.data.at("prompt").size();
+        int prompt_count = multiprompt_task.data.at("prompt").size();
        assert(prompt_count > 1);

        int multitask_id = id_gen++;
@@ -2410,9 +2410,7 @@ json oaicompat_completion_params_parse(
    }

    // Handle 'stop' field
-    if (body["stop"].is_null()) {
-        llama_params["stop"] = json::array({});
-    } else if (body["stop"].is_string()) {
+    if (body.contains("stop") && body["stop"].is_string()) {
        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
@@ -1083,7 +1083,7 @@ void ggml_metal_graph_compute(

                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                            // to the matrix-vector kernel
-                            int ne11_mm_min = 1;
+                            int ne11_mm_min = src0t == GGML_TYPE_F16 ? 1 : 16;

 #if 0
                            // the numbers below are measured on M2 Ultra for 7B and 13B models
@@ -15629,7 +15629,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            } break;
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
        case GGML_OP_SOFT_MAX_BACK:
        case GGML_OP_ROPE:
        case GGML_OP_ROPE_BACK:
@@ -15645,6 +15644,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            {
                n_tasks = 1; //TODO
            } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+            } break;
        case GGML_OP_CONV_TRANSPOSE_1D:
            {
                n_tasks = n_threads;
@@ -15876,18 +15879,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {

    // thread scheduling for the different operations + work buffer size estimation
    for (int i = 0; i < cgraph->n_nodes; i++) {
-        int n_tasks = 1;
-
        struct ggml_tensor * node = cgraph->nodes[i];

+        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+
        size_t cur = 0;

        switch (node->op) {
            case GGML_OP_CPY:
            case GGML_OP_DUP:
                {
-                    n_tasks = n_threads;
-
                    if (ggml_is_quantized(node->type)) {
                        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                    }
@@ -15895,16 +15896,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
            case GGML_OP_ADD:
            case GGML_OP_ADD1:
                {
-                    n_tasks = n_threads;
-
                    if (ggml_is_quantized(node->src[0]->type)) {
                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                    }
                } break;
            case GGML_OP_ACC:
                {
-                    n_tasks = n_threads;
-
                    if (ggml_is_quantized(node->src[0]->type)) {
                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                    }
@@ -15932,16 +15929,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                } break;
            case GGML_OP_OUT_PROD:
                {
-                    n_tasks = n_threads;
-
                    if (ggml_is_quantized(node->src[0]->type)) {
                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                    }
                } break;
            case GGML_OP_SOFT_MAX:
                {
-                    n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
-
                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                } break;
            case GGML_OP_CONV_TRANSPOSE_1D:
@@ -15971,7 +15964,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                } break;
            case GGML_OP_IM2COL:
                {
-                    n_tasks = n_threads;
                } break;
            case GGML_OP_CONV_TRANSPOSE_2D:
                {
@@ -15989,8 +15981,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                } break;
            case GGML_OP_FLASH_ATTN:
                {
-                    n_tasks = n_threads;
-
                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);

                    if (node->src[1]->type == GGML_TYPE_F32) {
@@ -16003,8 +15993,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                } break;
            case GGML_OP_FLASH_FF:
                {
-                    n_tasks = n_threads;
-
                    if (node->src[1]->type == GGML_TYPE_F32) {
                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
@@ -16015,8 +16003,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                } break;
            case GGML_OP_FLASH_ATTN_BACK:
                {
-                    n_tasks = n_threads;
-
                    const int64_t    D = node->src[0]->ne[0];
                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
                    const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
@@ -16031,8 +16017,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {

            case GGML_OP_CROSS_ENTROPY_LOSS:
                {
-                    n_tasks = n_threads;
-
                    cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
                } break;
            case GGML_OP_COUNT:
@@ -1991,10 +1991,13 @@ struct llama_model_loader {
        return tensor;
    }

-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
        struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());

        if (cur == NULL) {
+            if (!required) {
+                return NULL;
+            }
            throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
        }

@@ -2812,29 +2815,11 @@ static void llm_load_tensors(
                        layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
                        layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);

-                        try {
-                            layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
-                        } catch (const std::runtime_error& e) {
-                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bq = NULL; else throw;
-                        }
-
-                        try {
-                            layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
-                        } catch (const std::runtime_error& e) {
-                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bk = NULL; else throw;
-                        }
-
-                        try {
-                            layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
-                        } catch (const std::runtime_error& e) {
-                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bv = NULL; else throw;
-                        }
-
-                        try {
-                            layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
-                        } catch (const std::runtime_error& e) {
-                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bo = NULL; else throw;
-                        }
+                        // optional bias tensors
+                        layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     backend, false);
+                        layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, backend, false);
+                        layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, backend, false);
+                        layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     backend, false);

                        layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);

@@ -5759,8 +5744,7 @@ static int llama_decode_internal(
    // a heuristic, to avoid attending the full cache if it is not yet utilized
    // after enough generations, the benefit from this heuristic disappears
    // if we start defragmenting the cache, the benefit from this will be more important
-    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
-    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
+    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));

    //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
Author	SHA1	Message	Date
Miwa / Ensan	d208995c6d	swift : fix concatenation method to avoid invalid UTF8 stringfication (#4325 )	2023-12-04 18:03:49 +02:00
Miwa / Ensan	5c9f90cba1	swift : fix prompt tokenization logic (#4321 )	2023-12-04 15:43:45 +02:00
Ikko Eltociear Ashimine	4fa44e84ad	grammar-parser : fix typo (#4318 ) preceeding -> preceding	2023-12-04 09:57:35 +02:00
Georgi Gerganov	fbbc42827b	ggml : reuse ggml_get_n_tasks() in ggml_graph_plan() (#4308 ) * ggml : fix soft max out-of-bounds access ggml-ci * ggml : reuse ggml_get_n_tasks() in ggml_graph_plan() ggml-ci	2023-12-03 15:56:35 +02:00
Georgi Gerganov	adf3de4f69	ggml : fix soft max out-of-bounds access (#4307 ) ggml-ci	2023-12-03 15:56:22 +02:00
Ed Lee	33e171d1e9	server : fix OpenAI API `stop` field to be optional (#4299 ) (cherry picked from commit Mozilla-Ocho/llamafile@e8c92bcb84)	2023-12-03 11:10:43 +02:00
Rickard Edén	6949b50df5	py : add grammar to oai like api (#4294 )	2023-12-03 11:03:25 +02:00
Georgi Gerganov	d7b800b8bc	llama : pad KV cache size (#4280 ) * llama : pad KV cache size to 32 * metal : try to improve batched decoding	2023-12-03 10:58:16 +02:00
Georgi Gerganov	5a7d3125e7	llama : avoid using "optional" keyword (#4283 )	2023-12-01 20:39:12 +02:00
Georgi Gerganov	d5a1cbde60	llama : support optional tensors (#4283 )	2023-12-01 20:35:47 +02:00