server : add /detokenize endpoint (#2802 )

* Add a /detokenize endpoint to the example server * remove trailing white-space
convert.py : advanced option (#2753 )
2026-06-14 09:46:43 +02:00 · 2023-08-27 07:11:45 +08:00 · 2023-08-26 23:13:36 +03:00 · 2023-08-26 21:27:07 +03:00 · 2023-08-26 21:19:44 +03:00 · 2023-08-26 21:17:51 +03:00
9 changed files with 287 additions and 170 deletions
@@ -3,6 +3,7 @@
 import gguf
 import argparse
 import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 import copy
 import enum
 import faulthandler
@@ -17,13 +18,14 @@ import re
 import signal
 import struct
 import sys
+import time
 import zipfile
 import numpy as np

 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
 from pathlib import Path
-from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union)
+from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, TypeVar, Union)
 from sentencepiece import SentencePieceProcessor  # type: ignore

 if TYPE_CHECKING:
@@ -37,30 +39,70 @@ NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
 ARCH=gguf.MODEL_ARCH.LLAMA
 NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]

+DEFAULT_CONCURRENCY = 8
 #
 # data types
 #

@dataclass(frozen=True)
-class UnquantizedDataType:
+class DataType:
    name: str
+    dtype: 'np.dtype[Any]'
+    valid_conversions: List[str]

-DT_F16  = UnquantizedDataType('F16')
-DT_F32  = UnquantizedDataType('F32')
-DT_I32  = UnquantizedDataType('I32')
-DT_BF16 = UnquantizedDataType('BF16')
+    def elements_to_bytes(self, n_elements: int) -> int:
+        return n_elements * self.dtype.itemsize

-DataType = Union[UnquantizedDataType]
+@dataclass(frozen=True)
+class UnquantizedDataType(DataType):
+    pass

-DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
-    DT_BF16: np.dtype(np.uint16),
-    DT_F16:  np.dtype(np.float16),
-    DT_F32:  np.dtype(np.float32),
-    DT_I32:  np.dtype(np.int32),
-}
+DT_F16  = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
+DT_F32  = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
+DT_I32  = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
+DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])

-NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \
-    {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
+@dataclass(frozen=True)
+class QuantizedDataType(DataType):
+    block_size: int
+    quantized_dtype: 'np.dtype[Any]'
+    ggml_type: gguf.GGMLQuantizationType
+
+    def quantize(self, arr: NDArray) -> NDArray:
+        raise NotImplementedError(f'Quantization for {self.name} not implemented')
+
+    def elements_to_bytes(self, n_elements: int) -> int:
+        assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
+        return self.quantized_dtype.itemsize * (n_elements // self.block_size)
+
+@dataclass(frozen=True)
+class Q8_0QuantizedDataType(QuantizedDataType):
+    # Mini Q8_0 quantization in Python!
+    def quantize(self, arr: NDArray) -> NDArray:
+        assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
+        assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
+        n_blocks = arr.size // self.block_size
+        blocks = arr.reshape((n_blocks, self.block_size))
+        # Much faster implementation of block quantization contributed by @Cebtenzzre
+        def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[Tuple[Any, Any]]:
+            d = abs(blocks).max(axis = 1) / np.float32(127)
+            with np.errstate(divide = 'ignore'):
+                qs = (blocks / d[:, None]).round()
+            qs[d == 0] = 0
+            yield from zip(d, qs)
+        return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
+
+DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
+    dtype = np.dtype(np.float32), valid_conversions = [],
+    ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
+    quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
+
+# Quantized types skipped here because they may also map to np.float32
+NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = {}
+for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
+    if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
+        raise ValueError(f'Invalid duplicate data type {dt}')
+    NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt

 SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
    'BF16': DT_BF16,
@@ -73,20 +115,22 @@ SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
 # TODO: rename to LLAMAFileType
 # TODO: move to `gguf.py`
 class GGMLFileType(enum.IntEnum):
-    AllF32    = 0
-    MostlyF16 = 1  # except 1d tensors
+    AllF32     = 0
+    MostlyF16  = 1  # except 1d tensors
+    MostlyQ8_0 = 7  # except 1d tensors

    def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
-        if len(tensor.shape) == 1:
-            # 1D tensors are always F32.
-            return DT_F32
-        elif self == GGMLFileType.AllF32:
-            return DT_F32
-        elif self == GGMLFileType.MostlyF16:
-            return DT_F16
-        else:
+        dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
+        if dt is None:
            raise ValueError(self)
+        # 1D tensors are always F32.
+        return dt if len(tensor.shape) > 1 else DT_F32

+GGML_FILE_TYPE_TO_DATA_TYPE: Dict[GGMLFileType, DataType] = {
+    GGMLFileType.AllF32    : DT_F32,
+    GGMLFileType.MostlyF16 : DT_F16,
+    GGMLFileType.MostlyQ8_0: DT_Q8_0,
+}

 #
 # hparams loading
@@ -170,7 +214,8 @@ class Params:
        f_norm_eps       = config["rms_norm_eps"]
        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None

-        if "rope_scaling" in config and config["rope_scaling"].get("type") == "linear":
+        rope_scaling = config.get("rope_scaling")
+        if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
            f_rope_scale = config["rope_scaling"].get("factor")
        else:
            f_rope_scale = None
@@ -414,7 +459,7 @@ class UnquantizedTensor(Tensor):
        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]

    def astype(self, data_type: DataType) -> Tensor:
-        dtype = DATA_TYPE_TO_NUMPY[data_type]
+        dtype = data_type.dtype
        if self.data_type == DT_BF16:
            self.ndarray = bf16_to_fp32(self.ndarray)
        return UnquantizedTensor(self.ndarray.astype(dtype))
@@ -453,22 +498,6 @@ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, conv
 GGMLCompatibleTensor = Union[UnquantizedTensor]


-class DeferredPermutedTensor(Tensor):
-    def __init__(self, base: Tensor, n_head: int, n_head_kv: int) -> None:
-        self.base = base
-        self.n_head = n_head
-        self.data_type = self.base.data_type
-
-    def astype(self, data_type: DataType) -> Tensor:
-        return self.base.astype(data_type).permute(self.n_head, self.n_head_kv)
-
-    def to_ggml(self) -> GGMLCompatibleTensor:
-        return self.base.to_ggml().permute(self.n_head, self.n_head_kv)
-
-    def permute(self, n_head: int, n_head_kv: int) -> Tensor:
-        raise Exception("shouldn't permute twice")
-
-
@dataclass
 class LazyTensor:
    _load: Callable[[], Tensor]
@@ -478,7 +507,9 @@ class LazyTensor:

    def load(self) -> Tensor:
        ret = self._load()
-        assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description)
+        # Should be okay if it maps to the same numpy type?
+        assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
+                (self.data_type, ret.data_type, self.description)
        return ret

    def astype(self, data_type: DataType) -> 'LazyTensor':
@@ -489,8 +520,8 @@ class LazyTensor:
        return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')

    def validate_conversion_to(self, data_type: DataType) -> None:
-        if data_type == self.data_type:
-            return
+        if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
+            raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')


 LazyModel = Dict[str, LazyTensor]
@@ -616,9 +647,7 @@ class LazyUnpickler(pickle.Unpickler):
        info = self.zip_file.getinfo(filename)

        def load(offset: int, elm_count: int) -> NDArray:
-            dtype = DATA_TYPE_TO_NUMPY.get(data_type)
-            if dtype is None:
-                raise Exception("tensor stored in unsupported format")
+            dtype = data_type.dtype
            fp = self.zip_file.open(info)
            fp.seek(offset * dtype.itemsize)
            size = elm_count * dtype.itemsize
@@ -682,7 +711,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:

    def convert(info: Dict[str, Any]) -> LazyTensor:
        data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
-        numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
+        numpy_dtype = data_type.dtype
        shape: List[int] = info['shape']
        begin, end = info['data_offsets']
        assert 0 <= begin <= end <= len(byte_buf)
@@ -722,23 +751,35 @@ def lazy_load_file(path: Path) -> ModelPlus:
 In = TypeVar('In')
 Out = TypeVar('Out')

-def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
+def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, factory: Callable = ThreadPoolExecutor) -> Iterable[Out]:
    '''Parallel map, but with backpressure.  If the caller doesn't call `next`
    fast enough, this will stop calling `func` at some point rather than
    letting results pile up in memory.  Specifically, there is a max of one
    output value buffered per thread.'''
-    with concurrent.futures.ThreadPoolExecutor() as executor:
+    if concurrency < 2:
+        yield from map(func, iterable)
+        # Not reached.
+    iterable = iter(iterable)
+    with factory(max_workers = max_workers) as executor:
        futures: List[concurrent.futures.Future[Out]] = []
-        items_rev = list(iterable)[::-1]
-        for i in range(min(concurrency, len(items_rev))):
-            futures.append(executor.submit(func, items_rev.pop()))
+        done = False
+        for _ in range(concurrency):
+            try:
+                futures.append(executor.submit(func, next(iterable)))
+            except StopIteration:
+                done = True
+                break
+
        while futures:
            result = futures.pop(0).result()
-            if items_rev:
-                futures.append(executor.submit(func, items_rev.pop()))
+            while not done and len(futures) < concurrency:
+                try:
+                    futures.append(executor.submit(func, next(iterable)))
+                except StopIteration:
+                    done = True
+                    break
            yield result

-
 def check_vocab_size(params: Params, vocab: Vocab) -> None:
    if params.n_vocab != vocab.vocab_size:
        assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
@@ -803,12 +844,11 @@ class OutputFile:
        self.gguf.add_token_types(toktypes)

    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
-        n_elements = 1
-        for dim in tensor.shape:
-            n_elements *= dim
-        data_type = DATA_TYPE_TO_NUMPY[tensor.data_type]
-        data_nbytes = n_elements * data_type.itemsize
-        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes)
+        n_elements = int(np.prod(tensor.shape))
+        raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
+        data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
+        data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
+        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype)

    def write_meta(self) -> None:
        self.gguf.write_header_to_file()
@@ -834,7 +874,20 @@ class OutputFile:
        of.close()

    @staticmethod
-    def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+    def do_item(item: Tuple[str, LazyTensor]) -> Tuple[DataType, NDArray]:
+        name, lazy_tensor = item
+        tensor = lazy_tensor.load().to_ggml()
+        return (lazy_tensor.data_type, tensor.ndarray)
+
+    @staticmethod
+    def maybe_do_quantize(item: Tuple[DataType, NDArray]) -> NDArray:
+        dt, arr = item
+        if not isinstance(dt, QuantizedDataType):
+            return arr
+        return dt.quantize(arr)
+
+    @staticmethod
+    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
        check_vocab_size(params, vocab)

        of = OutputFile(fname_out)
@@ -850,16 +903,19 @@ class OutputFile:
        of.write_meta()
        of.write_tensor_info()

-        def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
-            name, lazy_tensor = item
-            return lazy_tensor.load().to_ggml().ndarray
-
        # tensor data
-        ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
+        if ftype == GGMLFileType.MostlyQ8_0:
+            ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, factory = ProcessPoolExecutor)
+        else:
+            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
+
+        start = time.time()
        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            elapsed = time.time() - start
            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
            padi = len(str(len(model)))
-            print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
+            print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
            of.gguf.write_tensor_data(ndarray)

        of.close()
@@ -871,6 +927,8 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
        return GGMLFileType.AllF32
    if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
        return GGMLFileType.MostlyF16
+    if output_type_str == "q8_0":
+        return GGMLFileType.MostlyQ8_0

    name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}

@@ -917,7 +975,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
            print(f"skipping tensor {name_new}")
            continue
        else:
-            print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type} | {lazy_tensor.shape}")
+            print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
            out[name_new] = lazy_tensor

    return out
@@ -1022,6 +1080,7 @@ def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
    namestr = {
        GGMLFileType.AllF32:    "f32",
        GGMLFileType.MostlyF16: "f16",
+        GGMLFileType.MostlyQ8_0:"q8_0",
    }[file_type]
    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
    if ret in model_paths:
@@ -1045,12 +1104,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
    parser.add_argument("--dump",        action="store_true",    help="don't convert, just show what's in the model")
    parser.add_argument("--dump-single", action="store_true",    help="don't convert, just show what's in a single model file")
    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
-    parser.add_argument("--outtype",     choices=["f32", "f16"], help="output format (default: based on input)")
+    parser.add_argument("--outtype",     choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
    parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
    parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
+    parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
    args = parser.parse_args(args_in)

    if args.dump_single:
@@ -1072,6 +1132,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
        params.ftype = {
            "f32": GGMLFileType.AllF32,
            "f16": GGMLFileType.MostlyF16,
+            "q8_0": GGMLFileType.MostlyQ8_0,
        }[args.outtype]

    print(f"params = {params}")
@@ -1103,7 +1164,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
        params.ftype = ftype
        print(f"Writing {outfile}, format {ftype}")

-        OutputFile.write_all(outfile, params, model, vocab)
+        OutputFile.write_all(outfile, ftype, params, model, vocab, concurrency = args.concurrency)
        print(f"Wrote {outfile}")


@@ -189,12 +189,19 @@ int main(int argc, char ** argv) {
        }
    }

-    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    // Add BOS if SPM tokenizer
+    const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;

    // tokenize the prompt
    std::vector<llama_token> embd_inp;
+
+    if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) {
+        // Add a space in front of the first character to match OG llama tokenizer behavior
+        params.prompt.insert(0, 1, ' ');
+    }
+
    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
-        embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
    } else {
        embd_inp = session_tokens;
    }
@@ -210,9 +217,9 @@ int main(int argc, char ** argv) {
    int original_prompt_len = 0;
    if (ctx_guidance) {
        params.cfg_negative_prompt.insert(0, 1, ' ');
-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);
+        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
        original_prompt_len = original_inp.size();
        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
    }
@@ -259,7 +266,7 @@ int main(int argc, char ** argv) {
    }

    // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false);

    // in instruct mode, we inject a prefix and a suffix to each input by the user
@@ -597,7 +604,12 @@ int main(int argc, char ** argv) {
                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                    last_n_repeat, alpha_frequency, alpha_presence);
                if (!penalize_nl) {
-                    logits[llama_token_nl(ctx)] = nl_logit;
+                    for (size_t idx = 0; idx < candidates_p.size; idx++) {
+                        if (candidates_p.data[idx].id == llama_token_nl(ctx)) {
+                            candidates_p.data[idx].logit = nl_logit;
+                            break;
+                        }
+                    }
                }

                if (grammar != NULL) {
@@ -351,6 +351,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);

    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    fprintf(stderr, "================================= is_spm = %d\n", is_spm);

    // This is needed as usual for LLaMA models
    const bool add_bos = is_spm;
@@ -406,6 +407,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    double acc = 0.0f;
    const int n_vocab = llama_n_vocab(ctx);

+    std::vector<std::vector<int>> ending_tokens(4);
+
    std::vector<float> tok_logits(n_vocab);

    for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
@@ -413,11 +416,21 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
        size_t context_size = context_embd.size();

+        for (int i = 0; i < 4; ++i) {
+            ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[i], add_bos);
+            for (int k = 0; k < int(context_size); ++k) {
+                if (ending_tokens[i][k] != context_embd[k]) {
+                    fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);
+                    break;
+                }
+            }
+        }
+
        // Do the 1st ending
        // In this case we include the context when evaluating
-        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
+        //auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
+        auto query_embd = ending_tokens[0];
        auto query_size = query_embd.size();
-        //printf("First query: %d\n",(int)query_size);

        // Stop if query wont fit the ctx window
        if (query_size > (size_t)params.n_ctx) {
@@ -462,7 +475,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {

            // Tokenize the query
-            query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
+            query_embd.resize(ending_tokens[ending_idx].size() - context_size);
+            std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int));
            query_size = query_embd.size();

            // Stop if query wont fit the ctx window
@@ -77,34 +77,31 @@ You need to have [Node.js](https://nodejs.org/en) installed.
 ```bash
 mkdir llama-client
 cd llama-client
-npm init
-npm install axios
 ```

 Create a index.js file and put inside this:

 ```javascript
-const axios = require("axios");
-
 const prompt = `Building a website can be done in 10 simple steps:`;

 async function Test() {
-    let result = await axios.post("http://127.0.0.1:8080/completion", {
-        prompt,
-        n_predict: 512,
-    });
-
-    // the response is received until completion finish
-    console.log(result.data.content);
+    let response = await fetch("http://127.0.0.1:8080/completion", {
+        method: 'POST',
+        body: JSON.stringify({
+            prompt,
+            n_predict: 512,
+        })
+    })
+    console.log((await response.json()).content)
 }

-Test();
+Test()
 ```

 And run it:

 ```bash
-node .
+node index.js
 ```

 ## API Endpoints
@@ -167,6 +164,12 @@ node .

    Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.

+-   **POST** `/detokenize`: Convert tokens to text.
+
+    *Options:*
+
+    `tokens`: Set the tokens to detokenize.
+
 -   **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.

    *Options:*
@@ -1104,6 +1104,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
        {"tokens", tokens}};
 }

+static json format_detokenized_response(std::string content)
+{
+    return json{
+        {"content", content}};
+}
+
 template <typename T>
 static T json_value(const json &body, const std::string &key, const T &default_value)
 {
@@ -1501,6 +1507,21 @@ int main(int argc, char **argv)
        const json data = format_tokenizer_response(tokens);
        return res.set_content(data.dump(), "application/json"); });

+    svr.Post("/detokenize", [&llama](const Request &req, Response &res)
+             {
+        auto lock = llama.lock();
+
+        const json body = json::parse(req.body);
+        std::string content;
+        if (body.count("tokens") != 0)
+        {
+            const std::vector<llama_token> tokens = body["tokens"];
+            content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
+        }
+
+        const json data = format_detokenized_response(content);
+        return res.set_content(data.dump(), "application/json"); });
+
    svr.Post("/embedding", [&llama](const Request &req, Response &res)
             {
        auto lock = llama.lock();
@@ -5,11 +5,11 @@
        "systems": "systems"
      },
      "locked": {
-        "lastModified": 1685518550,
-        "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
+        "lastModified": 1692799911,
+        "narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=",
        "owner": "numtide",
        "repo": "flake-utils",
-        "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
+        "rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44",
        "type": "github"
      },
      "original": {
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1685931219,
-        "narHash": "sha256-8EWeOZ6LKQfgAjB/USffUSELPRjw88A+xTcXnOUvO5M=",
+        "lastModified": 1692913444,
+        "narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "7409480d5c8584a1a83c422530419efe4afb0d19",
+        "rev": "18324978d632ffc55ef1d928e81630c620f4f447",
        "type": "github"
      },
      "original": {
@@ -6,6 +6,9 @@
  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
+        name = "llama.cpp";
+        src = ./.;
+        meta.mainProgram = "llama";
        inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
        buildInputs = with pkgs; [ openmpi ];
        osSpecific = with pkgs; buildInputs ++
@@ -21,11 +24,17 @@
              CoreGraphics
              CoreVideo
            ]
+          else if isDarwin then
+            with pkgs.darwin.apple_sdk.frameworks; [
+              Accelerate
+              CoreGraphics
+              CoreVideo
+            ]
          else
            with pkgs; [ openblas ]
        );
        pkgs = import nixpkgs { inherit system; };
-        nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
+        nativeBuildInputs = with pkgs; [ cmake ninja pkgconfig ];
        llama-python =
          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
        postPatch = ''
@@ -38,35 +47,35 @@
          mv $out/bin/server $out/bin/llama-server
        '';
        cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
-      in {
+      in
+      {
        packages.default = pkgs.stdenv.mkDerivation {
-          name = "llama.cpp";
-          src = ./.;
-          postPatch = postPatch;
-          nativeBuildInputs = nativeBuildInputs;
-          buildInputs = osSpecific;
+          inherit name src meta postPatch nativeBuildInputs buildInputs postInstall;
          cmakeFlags = cmakeFlags
            ++ (if isAarch64 && isDarwin then [
-              "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-              "-DLLAMA_METAL=ON"
-            ] else [
-              "-DLLAMA_BLAS=ON"
-              "-DLLAMA_BLAS_VENDOR=OpenBLAS"
+            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
+            "-DLLAMA_METAL=ON"
+          ] else [
+            "-DLLAMA_BLAS=ON"
+            "-DLLAMA_BLAS_VENDOR=OpenBLAS"
          ]);
-          postInstall = postInstall;
-          meta.mainProgram = "llama";
        };
        packages.opencl = pkgs.stdenv.mkDerivation {
-          name = "llama.cpp";
-          src = ./.;
-          postPatch = postPatch;
-          nativeBuildInputs = nativeBuildInputs;
+          inherit name src meta postPatch nativeBuildInputs postInstall;
          buildInputs = with pkgs; buildInputs ++ [ clblast ];
          cmakeFlags = cmakeFlags ++ [
            "-DLLAMA_CLBLAST=ON"
          ];
-          postInstall = postInstall;
-          meta.mainProgram = "llama";
+        };
+        packages.rocm = pkgs.stdenv.mkDerivation {
+          inherit name src meta postPatch nativeBuildInputs postInstall;
+          buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
+          cmakeFlags = cmakeFlags ++ [
+            "-DLLAMA_HIPBLAS=1"
+            "-DCMAKE_C_COMPILER=hipcc"
+            "-DCMAKE_CXX_COMPILER=hipcc"
+            "-DCMAKE_POSITION_INDEPENDENT_CODE=ON"
+          ];
        };
        apps.llama-server = {
          type = "app";
@@ -80,8 +89,13 @@
          type = "app";
          program = "${self.packages.${system}.default}/bin/llama";
        };
+        apps.quantize = {
+          type = "app";
+          program = "${self.packages.${system}.default}/bin/quantize";
+        };
        apps.default = self.apps.${system}.llama;
        devShells.default = pkgs.mkShell {
+          buildInputs = [ llama-python ];
          packages = nativeBuildInputs ++ osSpecific;
        };
      });
@@ -1,9 +1,6 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
 #endif

 #include "llama.h"
@@ -62,6 +59,9 @@
 #include <cinttypes>
 #include <climits>
 #include <cstdarg>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <fstream>
@@ -955,10 +955,10 @@ struct llama_vocab {
    id linefeed_id = 13;

    int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left,  " ",  "Ġ");
-        replace_all(token_left,  "\n", "Ċ");
-        replace_all(token_right, " ",  "Ġ");
-        replace_all(token_right, "\n", "Ċ");
+        replace_all(token_left,  " ",  "\u0120");
+        replace_all(token_left,  "\n", "\u010A");
+        replace_all(token_right, " ",  "\u0120");
+        replace_all(token_right, "\n", "\u010A");

        auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
        if (it == bpe_ranks.end()) {
@@ -1635,7 +1635,7 @@ static void llm_load_hparams(
 }

 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape);
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos);

 static void llm_load_vocab(
        llama_model_loader & ml,
@@ -1737,7 +1737,7 @@ static void llm_load_vocab(
    }

    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
-    vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0];
+    vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];

    // special tokens
    GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
@@ -3027,14 +3027,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
 }

 static std::string llama_escape_whitespace(const std::string& text) {
-    std::string result = "\xe2\x96\x81";
-    for (size_t offs = 0; offs < text.length(); ++offs) {
-        if (text[offs] == ' ') {
-            result += "\xe2\x96\x81";
-        } else {
-            result += text[offs];
-        }
-    }
+    std::string result = text;
+    replace_all(result, " ", "\xe2\x96\x81");
    return result;
 }

@@ -3219,7 +3213,7 @@ struct llm_bigram_bpe {
 };

 struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; }
+    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}

    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        int final_prev_index = -1;
@@ -3371,8 +3365,6 @@ private:
        return words;
    }

-    bool flag_g2ws = false;
-
    const llama_vocab & vocab;

    std::vector<llm_symbol> symbols;
@@ -3381,39 +3373,26 @@ private:
    llm_bigram_bpe::queue work_queue;
 };

-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos) {
    std::vector<llama_vocab::id> output;

    if (raw_text.empty()) {
        return output;
    }

+    if (bos && vocab.special_bos_id != -1) {
+        output.push_back(vocab.special_bos_id);
+    }
+
    switch (vocab.type) {
        case LLAMA_VOCAB_TYPE_SPM:
            {
                llm_tokenizer_spm tokenizer(vocab);
-
-                if (bos) {
-                    output.push_back(vocab.special_bos_id);
-                }
-
-                std::string text;
-                if (escape) {
-                    text = llama_escape_whitespace(raw_text);
-                } else {
-                    text = raw_text;
-                }
-
-                tokenizer.tokenize(text, output);
+                tokenizer.tokenize(llama_escape_whitespace(raw_text), output);
            } break;
        case LLAMA_VOCAB_TYPE_BPE:
            {
-                llm_tokenizer_bpe tokenizer(vocab, escape);
-
-                if (bos && vocab.special_bos_id != -1) {
-                    output.push_back(vocab.special_bos_id);
-                }
-
+                llm_tokenizer_bpe tokenizer(vocab);
                tokenizer.tokenize(raw_text, output);
            } break;
    };
@@ -3908,7 +3887,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *

    // Calculate absolute value of second derivatives
    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = abs(second_derivatives[i]);
+        second_derivatives[i] = std::abs(second_derivatives[i]);
    }

    // Normalize the second derivatives
@@ -4674,6 +4653,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));

+    llama_model model;
+    llm_load_arch(*ml, model);
+    llm_load_hparams(*ml, model, 0, 0, 0);
+
    const size_t align = GGUF_DEFAULT_ALIGNMENT;
    struct gguf_context * ctx_out = gguf_init_empty();

@@ -4699,6 +4682,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            ++n_feed_forward_w2;
        }
    }
+    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
+                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+    }

    int i_attention_wv = 0;
    int i_feed_forward_w2 = 0;
@@ -4775,8 +4762,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
                int nx = tensor->ne[0];
-                int ny = tensor->ne[1];
-                if (nx % QK_K == 0 && ny % QK_K == 0) {
+                if (nx % QK_K == 0) {
                    new_type = GGML_TYPE_Q6_K;
                }
            } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -4790,6 +4776,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+                if (model.type == MODEL_70B) {
+                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+                    // nearly negligible increase in model size by quantizing this tensor with more bits:
+                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+                }
                ++i_attention_wv;
            } else if (name.find("ffn_down.weight") != std::string::npos) {
                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -4819,8 +4811,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
                int nx = tensor->ne[0];
                int ny = tensor->ne[1];
-                if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                if (nx % QK_K != 0) {
+                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
                    convert_incompatible_tensor = true;
                }
            }
@@ -6095,8 +6087,7 @@ int llama_tokenize_with_model(
                 llama_token * tokens,
                         int   n_max_tokens,
                        bool   add_bos) {
-    auto escape = llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM;
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
+    auto res = llama_tokenize_internal(model->vocab, text, add_bos);

    if (n_max_tokens < (int) res.size()) {
        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -100,7 +100,8 @@ int main(int argc, char **argv) {
    bool success = true;

    for (const auto & test_kv : k_tests()) {
-        std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
+        // Add a space in front of the first character to match OG llama tokenizer behavior
+        std::vector<llama_token> res = llama_tokenize(ctx, " " + test_kv.first, true);
        fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
            __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
Author	SHA1	Message	Date
Bruce MacDonald	c1ac54b77a	server : add `/detokenize` endpoint (#2802 ) * Add a /detokenize endpoint to the example server * remove trailing white-space	2023-08-27 07:11:45 +08:00
Kerfuffle	730d9c681e	convert.py : advanced option (#2753 ) * Allow convert.py to convert to q8_0 Fix issue with bounded_parallel_map and greedy consuming iterator Display elapsed time during conversion * Add --concurrency option Minor improvements to help text Clean up bounded_parallel_map function a bit * Massive speed improvement thanks to Cebtenzzre * Refactor types	2023-08-26 23:13:36 +03:00
Tim Miller	c7d92e6dfe	llama : use Unicode Escape Sequence to replace encoded characters (#2814 ) The use of special characters within source files can break compiling on some computers with different region and language settings. Using Unicode escape sequences should allow for the code to be compiled on all setups without needing to change your computers settings or switch regions.	2023-08-26 21:27:07 +03:00
Tungsten842	61d1a2895e	flake.nix : add rocm support and cleanup (#2808 )	2023-08-26 21:19:44 +03:00
Cebtenzzre	741ca7dd1c	llama : move #includes out of _GNU_SOURCE conditional (#2817 )	2023-08-26 21:17:51 +03:00
Dr. Tom Murphy VII Ph.D	72f895c923	main : fix bug (penalize_nl=false doesn't work) + suppress warning on mingw (#1528 ) * Fix bug in main.cpp where penalize_nl=false has no effect. It modifies the underlying logits array, but at this point we are already working on the candidates copy. * Suppress redefinition warning for NOMINMAX on mingw. In my installation, this macro is already defined by /usr/lib/gcc/x86_64-w64-mingw32/11/include/c++/x86_64-w64-mingw32/bits/os_defines.h:45. * main : fix indentation * main : pass ctx to llama_token_nl() --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-08-26 21:12:56 +03:00
Cebtenzzre	50526f37eb	llama : use std::abs in llama_sample_tail_free (#2800 ) Plain 'abs' casts the input to int.	2023-08-26 19:53:52 +03:00
Georgi Gerganov	04f4b1eb10	k-quants : remove unnecessary tensor shape restrictions (#2811 )	2023-08-26 17:37:35 +03:00
Kawrakow	7592375403	Better perplexity for 2- and 3-bit quantization for LLaMA-v2-70B (#2807 ) * Better perplexity for 2- and 3-bit quantization for the 70B model * PR comment --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2023-08-26 17:27:49 +03:00
Kawrakow	771551a793	Fix HellaSwag (#2805 ) Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2023-08-26 16:48:53 +03:00
Volodymyr Vitvitskyi	f305bad11e	flake : build llama.cpp on Intel with nix (#2795 ) Problem ------- `nix build` fails with missing `Accelerate.h`. Changes ------- - Fix build of the llama.cpp with nix for Intel: add the same SDK frameworks as for ARM - Add `quantize` app to the output of nix flake - Extend nix devShell with llama-python so we can use convertScript Testing ------- Testing the steps with nix: 1. `nix build` Get the model and then 2. `nix develop` and then `python convert.py models/llama-2-7b.ggmlv3.q4_0.bin` 3. `nix run llama.cpp#quantize -- open_llama_7b/ggml-model-f16.gguf ./models/ggml-model-q4_0.bin 2` 4. `nix run llama.cpp#llama -- -m models/ggml-model-q4_0.bin -p "What is nix?" -n 400 --temp 0.8 -e -t 8` Co-authored-by: Volodymyr Vitvitskyi <volodymyrvitvitskyi@SamsungPro.local>	2023-08-26 16:25:39 +03:00
Nigel Bosch	a2ca4e9de9	Handle null rope scaling value (#2793 )	2023-08-26 14:11:17 +02:00
klosax	2ba83c8685	Fix spm whitespaces (#2806 ) * llama.cpp : fix spm whitespace escaping + clean up * main.cpp : spm - add whitespace in front of prompt * test-tokenizer-0.cpp : spm - add whitespace in front of prompt	2023-08-26 13:45:53 +02:00
lon	bae5c5f679	examples : skip unnecessary external lib in server README.md how-to (#2804 )	2023-08-26 16:07:43 +08:00