llama : fix struct decl (#2790 )

Faster perplexity computation (#2786 )
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-06-14 01:36:47 +02:00 · 2023-08-25 19:17:15 +03:00 · 2023-08-25 19:05:02 +03:00 · 2023-08-25 18:18:48 +03:00 · 2023-08-25 16:41:52 +02:00 · 2023-08-25 15:16:19 +02:00
25 changed files with 3038 additions and 1271 deletions
@@ -0,0 +1,44 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -0,0 +1,44 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT [ "/app/main" ]
@@ -5,14 +5,7 @@
 .vscode/
 .DS_Store

-build/
-build-em/
-build-debug/
-build-release/
-build-static/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build*/

 models/*

@@ -16,20 +16,7 @@
 .vs/
 .vscode/

-build/
-build-em/
-build-debug/
-build-release/
-build-ci-debug/
-build-ci-release/
-build-static/
-build-cublas/
-build-opencl/
-build-metal/
-build-mpi/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build*/
 out/
 tmp/

@@ -74,6 +74,7 @@ set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kern
 set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some calculations"   OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
+option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
@@ -352,6 +353,43 @@ if (LLAMA_CLBLAST)
    endif()
 endif()

+if (LLAMA_HIPBLAS)
+    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
+
+    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
+    endif()
+    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
+    endif()
+
+    find_package(hip)
+    find_package(hipblas)
+    find_package(rocblas)
+
+    if (${hipblas_FOUND} AND ${hip_FOUND})
+        message(STATUS "HIP and hipBLAS found")
+        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
+        if (LLAMA_CUDA_FORCE_DMMV)
+            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
+        endif()
+        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+        target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+        target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
+        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
+        target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
+
+        if (LLAMA_STATIC)
+            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
+        endif()
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
+    else()
+        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
        set(c_flags
@@ -280,6 +280,30 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_CLBLAST

+ifdef LLAMA_HIPBLAS
+	ROCM_PATH	?= /opt/rocm
+	HIPCC	    ?= $(ROCM_PATH)/bin/hipcc
+	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+	LLAMA_CUDA_DMMV_X       ?= 32
+	LLAMA_CUDA_MMV_Y        ?= 1
+	LLAMA_CUDA_KQUANTS_ITER ?= 2
+	CFLAGS      += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	CXXFLAGS    += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	LDFLAGS     += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+	LDFLAGS		+= -lhipblas -lamdhip64 -lrocblas
+	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
+	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+	HIPFLAGS    += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+	HIPFLAGS    += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+	HIPFLAGS    += -DCC_TURING=1000000000
+ifdef LLAMA_CUDA_FORCE_DMMV
+	HIPFLAGS 	+= -DGGML_CUDA_FORCE_DMMV
+endif # LLAMA_CUDA_FORCE_DMMV
+	OBJS        += ggml-cuda.o
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+endif # LLAMA_HIPBLAS
+
 ifdef LLAMA_METAL
 	CFLAGS   += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 	CXXFLAGS += -DGGML_USE_METAL
@@ -422,6 +422,35 @@ Building the program with BLAS support may lead to some performance improvements
  | LLAMA_CUDA_F16          | Boolean                |   false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 |       2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

+- #### hipBLAS
+
+  This provide BLAS acceleation on HIP supported GPU like AMD GPU.
+  Make sure to have ROCm installed.
+  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
+  Windows support is coming soon...
+
+  - Using `make`:
+    ```bash
+    make LLAMA_HIPBLAS=1
+    ```
+  - Using `CMake`:
+    ```bash
+    mkdir build
+    cd build
+    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
+    cmake --build .
+    ```
+
+  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
+  If your GPU is not officialy supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
+  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
+
+  | Option                  | Legal values           | Default | Description |
+  |-------------------------|------------------------|---------|-------------|
+  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 |      32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+  | LLAMA_CUDA_MMV_Y        | Positive integer       |       1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 |       2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+
 - #### CLBlast

  OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
@@ -391,6 +391,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    ln -sfn ${mnt_models} ${SRC}/models-mnt

    python3 -m pip install -r ${SRC}/requirements.txt
+    python3 -m pip install --editable gguf-py
 fi

 ret=0
@@ -613,9 +613,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+#ifdef GGML_USE_CUBLAS
    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
-    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
+#endif // GGML_USE_CUBLAS
 #endif
    fprintf(stdout, "  --mtest               compute maximum memory usage\n");
    fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");
@@ -28,6 +28,7 @@ struct gpt_params {
    int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
    int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t n_beams                         = 0;    // if non-zero then use beam search of given width.
    float   rope_freq_base                  = 10000.0f; // RoPE base frequency
    float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor

@@ -105,6 +105,7 @@ class Params:
    f_norm_eps: float

    f_rope_freq_base: Optional[float] = None
+    f_rope_scale: Optional[float] = None

    ftype: Optional[GGMLFileType] = None

@@ -160,13 +161,19 @@ class Params:
    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
        config = json.load(open(config_path))

-        n_vocab    = config["vocab_size"]
-        n_embd     = config["hidden_size"]
-        n_layer    = config["num_hidden_layers"]
-        n_ff       = config["intermediate_size"]
-        n_head     = config["num_attention_heads"]
-        n_head_kv  = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
-        f_norm_eps = config["rms_norm_eps"]
+        n_vocab          = config["vocab_size"]
+        n_embd           = config["hidden_size"]
+        n_layer          = config["num_hidden_layers"]
+        n_ff             = config["intermediate_size"]
+        n_head           = config["num_attention_heads"]
+        n_head_kv        = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
+        f_norm_eps       = config["rms_norm_eps"]
+        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
+
+        if "rope_scaling" in config and config["rope_scaling"].get("type") == "linear":
+            f_rope_scale = config["rope_scaling"].get("factor")
+        else:
+            f_rope_scale = None

        n_mult = Params.find_n_mult(n_ff, n_embd)

@@ -179,15 +186,17 @@ class Params:
                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

        return Params(
-            n_vocab    = n_vocab,
-            n_embd     = n_embd,
-            n_mult     = n_mult,
-            n_layer    = n_layer,
-            n_ctx      = n_ctx,
-            n_ff       = n_ff,
-            n_head     = n_head,
-            n_head_kv  = n_head_kv,
-            f_norm_eps = f_norm_eps,
+            n_vocab          = n_vocab,
+            n_embd           = n_embd,
+            n_mult           = n_mult,
+            n_layer          = n_layer,
+            n_ctx            = n_ctx,
+            n_ff             = n_ff,
+            n_head           = n_head,
+            n_head_kv        = n_head_kv,
+            f_norm_eps       = f_norm_eps,
+            f_rope_freq_base = f_rope_freq_base,
+            f_rope_scale     = f_rope_scale,
        )

    # LLaMA v2 70B params.json
@@ -771,6 +780,9 @@ class OutputFile:
        if params.f_rope_freq_base:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)

+        if params.f_rope_scale:
+            self.gguf.add_rope_scale_linear(params.f_rope_scale)
+
        if params.ftype:
            self.gguf.add_file_type(params.ftype)

@@ -25,6 +25,7 @@ else()
    add_subdirectory(simple)
    add_subdirectory(embd-input)
    add_subdirectory(llama-bench)
+    add_subdirectory(beam_search)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
@@ -0,0 +1,8 @@
+set(TARGET beam_search)
+add_executable(${TARGET} beam_search.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
@@ -0,0 +1,188 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "common.h"
+#include "llama.h"
+#include "build-info.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include <signal.h>
+#endif
+
+// Used for debugging to print out beam tokens.
+struct ostream_beam_view {
+    llama_context * ctx;
+    llama_beam_view beam_view;
+};
+std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
+    os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
+    for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
+        os << llama_token_to_str(obv.ctx, obv.beam_view.tokens[i]);
+    }
+    return os << ')';
+}
+
+// Put here anything you want back in beam_search_callback().
+struct beam_search_callback_data {
+    llama_context * ctx;
+    std::vector<llama_token> response;
+};
+
+// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
+// For example, eob can be flagged due to maximum token length, stop words, etc.
+bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
+    return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
+}
+
+// Function matching type llama_beam_search_callback_fn_t.
+// Custom callback example is called each time the beams lengths increase:
+//  * Show progress by printing ',' following by number of convergent beam tokens if any.
+//  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
+//    This is also called when the stop condition is met.
+//    Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
+void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
+    auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
+    // Mark beams as EOS as needed.
+    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
+        llama_beam_view& beam_view = beams_state.beam_views[i];
+        if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
+            beam_view.eob = true;
+        }
+    }
+    printf(",");  // Show progress
+    if (const size_t n = beams_state.common_prefix_length) {
+        callback_data.response.resize(callback_data.response.size() + n);
+        assert(0u < beams_state.n_beams);
+        const llama_token * tokens = beams_state.beam_views[0].tokens;
+        std::copy(tokens, tokens + n, callback_data.response.end() - n);
+        printf("%lu", n);
+    }
+    fflush(stdout);
+#if 1 // DEBUG: print current beams for this iteration
+    std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
+    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
+        std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
+    }
+#endif
+}
+
+int main(int argc, char ** argv)
+{
+    gpt_params params;
+    //params.n_gpu_layers = 200;
+
+    //---------------------------------
+    // Print help :
+    //---------------------------------
+
+    if ( argc < 2 || argv[1][0] == '-' )
+    {
+        printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
+        return 1 ;
+    }
+
+    //---------------------------------
+    // Load parameters :
+    //---------------------------------
+
+    params.model = argv[1];
+
+    params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
+
+    if ( argc > 3 )
+    {
+        params.prompt = argv[3];
+    }
+
+    if ( params.prompt.empty() )
+    {
+        params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
+    }
+
+    //---------------------------------
+    // Init LLM :
+    //---------------------------------
+
+    llama_backend_init(params.numa);
+
+    llama_model * model;
+    llama_context * ctx;
+
+    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+
+    if ( model == NULL )
+    {
+        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+        return 1;
+    }
+
+    //---------------------------------
+    // Tokenize the prompt :
+    //---------------------------------
+
+    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
+
+    const size_t max_context_size     = llama_n_ctx( ctx );
+    const size_t max_tokens_list_size = max_context_size - 4 ;
+
+    if (tokens_list.size() > max_tokens_list_size)
+    {
+        fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" ,
+             __func__ , tokens_list.size() , max_tokens_list_size );
+        return 1;
+    }
+
+    fprintf( stderr, "\n\n" );
+
+    // Print the tokens from the prompt :
+
+    for( auto id : tokens_list )
+    {
+        std::cout << llama_token_to_str(ctx, id);
+    }
+    std::cout << std::flush;
+
+    int n_past = llama_get_kv_cache_token_count(ctx);
+    if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
+    {
+        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
+        return 1;
+    }
+    n_past += tokens_list.size();
+
+    beam_search_callback_data callback_data{ctx, {}};
+    size_t const beam_width = static_cast<size_t>(params.n_beams);
+    int const n_predict = 256;
+    llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
+
+    std::cout << "\n\n";
+    for (llama_token const token_id : callback_data.response) {
+        std::cout << llama_token_to_str(ctx,token_id);
+    }
+    std::cout << std::endl;
+
+    llama_free( ctx );
+    llama_free_model( model );
+
+    llama_backend_free();
+
+    return 0;
+}
@@ -18,9 +18,7 @@
 #include "llama.h"
 #include "common.h"
 #include "build-info.h"
-#ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
-#endif

 // utils
 static uint64_t get_time_ns() {
@@ -443,6 +441,8 @@ struct test {
    static const std::string gpu_info;
    std::string model_filename;
    std::string model_type;
+    uint64_t model_size;
+    uint64_t model_n_params;
    int n_batch;
    int n_threads;
    bool f32_kv;
@@ -459,8 +459,10 @@ struct test {
    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
        model_filename = inst.model;
        char buf[128];
-        llama_model_type(lmodel, buf, sizeof(buf));
+        llama_model_desc(lmodel, buf, sizeof(buf));
        model_type = buf;
+        model_size = llama_model_size(lmodel);
+        model_n_params = llama_model_n_params(lmodel);
        n_batch = inst.n_batch;
        n_threads = inst.n_threads;
        f32_kv = inst.f32_kv;
@@ -504,7 +506,7 @@ struct test {

    static std::string get_backend() {
        if (cuda) {
-            return "CUDA";
+            return GGML_CUDA_NAME;
        }
        if (opencl) {
            return "OpenCL";
@@ -526,7 +528,7 @@ struct test {
            "build_commit", "build_number",
            "cuda", "opencl", "metal", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
-            "model_filename", "model_type",
+            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_threads", "f16_kv",
            "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
            "n_prompt", "n_gen", "test_time",
@@ -540,6 +542,7 @@ struct test {

    static field_type get_field_type(const std::string & field) {
        if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
+            field == "model_size" || field == "model_n_params" ||
            field == "n_gpu_layers" || field == "main_gpu" ||
            field == "n_prompt" || field == "n_gen" ||
            field == "avg_ns" || field == "stddev_ns") {
@@ -575,7 +578,7 @@ struct test {
            build_commit, std::to_string(build_number),
            std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
-            model_filename, model_type,
+            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
@@ -711,8 +714,15 @@ struct markdown_printer : public printer {
            return -30;
        }
        if (field == "t/s") {
-            return 15;
+            return 16;
        }
+        if (field == "size" || field == "params") {
+            return 10;
+        }
+        if (field == "n_gpu_layers") {
+            return 3;
+        }
+
        int width = std::max((int)field.length(), 10);

        if (test::get_field_type(field) == test::STRING) {
@@ -721,9 +731,28 @@ struct markdown_printer : public printer {
        return width;
    }

+    static std::string get_field_display_name(const std::string & field) {
+        if (field == "n_gpu_layers") {
+            return "ngl";
+        }
+        if (field == "n_threads") {
+            return "threads";
+        }
+        if (field == "mul_mat_q") {
+            return "mmq";
+        }
+        if (field == "tensor_split") {
+            return "ts";
+        }
+        return field;
+    }
+
    void print_header(const cmd_params & params) override {
        // select fields to print
-        fields = { "model", "backend" };
+        fields.push_back("model");
+        fields.push_back("size");
+        fields.push_back("params");
+        fields.push_back("backend");
        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
        if (!is_cpu_backend) {
            fields.push_back("n_gpu_layers");
@@ -754,7 +783,7 @@ struct markdown_printer : public printer {

        fprintf(fout, "|");
        for (const auto & field : fields) {
-            fprintf(fout, " %*s |", get_field_width(field), field.c_str());
+            fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
        }
        fprintf(fout, "\n");
        fprintf(fout, "|");
@@ -771,12 +800,26 @@ struct markdown_printer : public printer {
        fprintf(fout, "|");
        for (const auto & field : fields) {
            std::string value;
+            char buf[128];
            if (field == "model") {
                value = t.model_type;
+            } else if (field == "size") {
+                if (t.model_size < 1024*1024*1024) {
+                    snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
+                } else {
+                    snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
+                }
+                value = buf;
+            } else if (field == "params") {
+                if (t.model_n_params < 1000*1000*1000) {
+                    snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
+                } else {
+                    snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
+                }
+                value = buf;
            } else if (field == "backend") {
                value = test::get_backend();
            } else if (field == "test") {
-                char buf[128];
                if (t.n_prompt > 0 && t.n_gen == 0) {
                    snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
                } else if (t.n_gen > 0 && t.n_prompt == 0) {
@@ -787,7 +830,6 @@ struct markdown_printer : public printer {
                }
                value = buf;
            } else if (field == "t/s") {
-                char buf[128];
                snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                value = buf;
            } else if (vmap.find(field) != vmap.end()) {
@@ -6,6 +6,8 @@
 #include <ctime>
 #include <sstream>
 #include <cstring>
+#include <thread>
+#include <mutex>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -27,6 +29,40 @@ std::vector<float> softmax(const std::vector<float>& logits) {
    return probs;
 }

+float log_softmax(int n_vocab, const float * logits, int tok) {
+    float max_logit = logits[0];
+    for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
+    double sum_exp = 0.0;
+    for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit);
+    return logits[tok] - max_logit - log(sum_exp);
+}
+
+void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread>& workers,
+        double& nll, double& nll2) {
+
+    std::mutex mutex;
+    int counter = 0;
+    auto compute = [&mutex, &counter, &nll, &nll2, n_vocab, logits, tokens, n_token] () {
+        double local_nll = 0, local_nll2 = 0;
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex);
+            int i = counter++;
+            if (i >= n_token) {
+                nll += local_nll; nll2 += local_nll2;
+                break;
+            }
+            lock.unlock();
+            double v = -log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
+            local_nll += v;
+            local_nll2 += v*v;
+        }
+    };
+    for (auto& w : workers) w = std::thread(compute);
+    compute();
+    for (auto& w : workers) w.join();
+
+}
+
 void perplexity_v2(llama_context * ctx, const gpt_params & params) {
    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
@@ -166,9 +202,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {

    int count = 0;
    double nll = 0.0;
+    double nll2 = 0.0;

    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);

+    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
+
    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * params.n_ctx;
        const int end   = start + params.n_ctx;
@@ -228,26 +267,32 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens.  Then, we split the input up into context window size chunks to
        // process the entire prompt.
-        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
-            // Calculate probability of next token, given the previous ones.
-            const std::vector<float> tok_logits(
-                logits.begin() + (j + 0) * n_vocab,
-                logits.begin() + (j + 1) * n_vocab);
+        const int first = std::min(512, params.n_ctx/2);
+        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, workers, nll, nll2);
+        count += params.n_ctx - first - 1;

-            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
-
-            nll += -std::log(prob);
-            ++count;
-        }
        // perplexity is e^(average negative log-likelihood)
        if (params.ppl_output_type == 0) {
            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
        } else {
-            printf("%8d  %.4lf\n", i*params.n_ctx, std::exp(nll / count));
+            double av = nll/count;
+            double av2 = nll2/count - av*av;
+            if (av2 > 0) av2 = sqrt(av2/(count-1));
+            printf("%8d  %.4lf  %4lf  %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2);
        }
        fflush(stdout);
    }
    printf("\n");
+    nll2 /= count;
+    nll /= count;
+    nll2 -= nll * nll;
+    if (nll2 > 0) {
+        nll2 = sqrt(nll2/(count-1));
+        double ppl = exp(nll);
+        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+    } else {
+        printf("Unexpected negative standard deviation of log(prob)\n");
+    }
 }

 std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
@@ -102,6 +102,17 @@
      padding: 0.5em;
    }

+    .prob-set {
+      padding: 0.3em;
+      border-bottom: 1px solid #ccc;
+    }
+
+    .popover-content {
+      position: absolute;
+      background-color: white;
+      padding: 0.2em;
+      box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+    }

    textarea {
      padding: 5px;
@@ -133,11 +144,17 @@
      font-size: 80%;
      color: #888;
    }
+
+    @media (prefers-color-scheme: dark) {
+      .popover-content {
+        background-color: black;
+      }
+    }
  </style>

  <script type="module">
    import {
-      html, h, signal, effect, computed, render, useSignal, useEffect, useRef
+      html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
    } from '/index.js';

    import { llama } from '/completion.js';
@@ -168,6 +185,7 @@
      mirostat_tau: 5, // target entropy
      mirostat_eta: 0.1, // learning rate
      grammar: '',
+      n_probs: 0, // no completion_probabilities
    })

    /* START: Support for storing prompt templates and parameters in borwser LocalStorage */
@@ -334,10 +352,21 @@

      const prompt = template(session.value.template, {
        message: msg,
-        history: session.value.transcript.flatMap(([name, message]) => template(session.value.historyTemplate, {name, message})).join("\n"),
+        history: session.value.transcript.flatMap(
+          ([name, data]) =>
+            template(
+              session.value.historyTemplate,
+              {
+                name,
+                message: Array.isArray(data) ?
+                  data.map(msg => msg.content).join('').replace(/^\s/, '') :
+                  data,
+              }
+            )
+        ).join("\n"),
      });

-      let currentMessage = '';
+      const currentMessages = [];
      const history = session.value.transcript

      const llamaParams = {
@@ -347,15 +376,19 @@

      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
        const data = chunk.data;
-        currentMessage += data.content;
-
-        // remove leading whitespace
-        currentMessage = currentMessage.replace(/^\s+/, "")
-
-        transcriptUpdate([...history, ["{{char}}", currentMessage]])

        if (data.stop) {
-          console.log("Completion finished: '", currentMessage, "', summary: ", data);
+          while (
+            currentMessages.length > 0 &&
+            currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
+          ) {
+            currentMessages.pop();
+          }
+          transcriptUpdate([...history, ["{{char}}", currentMessages]])
+          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
+        } else {
+          currentMessages.push(data);
+          transcriptUpdate([...history, ["{{char}}", currentMessages]])
        }

        if (data.timings) {
@@ -420,8 +453,18 @@
        }
      }, [messages])

-      const chatLine = ([user, msg]) => {
-        return html`<p key=${msg}><strong>${template(user)}:</strong> <${Markdownish} text=${template(msg)} /></p>`
+      const chatLine = ([user, data], index) => {
+        let message
+        const isArrayMessage = Array.isArray(data)
+        if (params.value.n_probs > 0 && isArrayMessage) {
+          message = html`<${Probabilities} data=${data} />`
+        } else {
+          const text = isArrayMessage ?
+            data.map(msg => msg.content).join('').replace(/^\s+/, '') :
+            data;
+          message = html`<${Markdownish} text=${template(text)} />`
+        }
+        return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
      };

      return html`
@@ -568,10 +611,71 @@
              ${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
              ${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
            </fieldset>
+            <fieldset>
+              ${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
+            </fieldset>
          </details>
        </form>
      `
    }
+
+    const probColor = (p) => {
+      const r = Math.floor(192 * (1 - p));
+      const g = Math.floor(192 * p);
+      return `rgba(${r},${g},0,0.3)`;
+    }
+
+    const Probabilities = (params) => {
+      return params.data.map(msg => {
+        const { completion_probabilities } = msg;
+        if (
+          !completion_probabilities ||
+          completion_probabilities.length === 0
+        ) return msg.content
+
+        if (completion_probabilities.length > 1) {
+          // Not for byte pair
+          if (completion_probabilities[0].content.startsWith('byte: \\')) return msg.content
+
+          const splitData = completion_probabilities.map(prob => ({
+            content: prob.content,
+            completion_probabilities: [prob]
+          }))
+          return html`<${Probabilities} data=${splitData} />`
+        }
+
+        const { probs, content } = completion_probabilities[0]
+        const found = probs.find(p => p.tok_str === msg.content)
+        const pColor = found ? probColor(found.prob) : 'transparent'
+
+        const popoverChildren = html`
+          <div class="prob-set">
+            ${probs.map((p, index) => {
+              return html`
+                <div
+                  key=${index}
+                  title=${`prob: ${p.prob}`}
+                  style=${{
+                    padding: '0.3em',
+                    backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
+                  }}
+                >
+                  <span>${p.tok_str}: </span>
+                  <span>${Math.floor(p.prob * 100)}%</span>
+                </div>
+              `
+            })}
+          </div>
+        `
+
+        return html`
+          <${Popover} style=${{ backgroundColor: pColor }} popoverChildren=${popoverChildren}>
+            ${msg.content.match(/\n/gim) ? html`<br />` : msg.content}
+          </>
+        `
+      });
+    }
+
    // poor mans markdown replacement
    const Markdownish = (params) => {
      const md = params.text
@@ -600,10 +704,121 @@
      `
    }

+    // simple popover impl
+    const Popover = (props) => {
+      const isOpen = useSignal(false);
+      const position = useSignal({ top: '0px', left: '0px' });
+      const buttonRef = useRef(null);
+      const popoverRef = useRef(null);
+
+      const togglePopover = () => {
+        if (buttonRef.current) {
+          const rect = buttonRef.current.getBoundingClientRect();
+          position.value = {
+            top: `${rect.bottom + window.scrollY}px`,
+            left: `${rect.left + window.scrollX}px`,
+          };
+        }
+        isOpen.value = !isOpen.value;
+      };
+
+      const handleClickOutside = (event) => {
+        if (popoverRef.current && !popoverRef.current.contains(event.target) && !buttonRef.current.contains(event.target)) {
+          isOpen.value = false;
+        }
+      };
+
+      useEffect(() => {
+        document.addEventListener('mousedown', handleClickOutside);
+        return () => {
+          document.removeEventListener('mousedown', handleClickOutside);
+        };
+      }, []);
+
+      return html`
+        <span style=${props.style} ref=${buttonRef} onClick=${togglePopover}>${props.children}</span>
+        ${isOpen.value && html`
+          <${Portal} into="#portal">
+            <div
+              ref=${popoverRef}
+              class="popover-content"
+              style=${{
+                top: position.value.top,
+                left: position.value.left,
+              }}
+            >
+              ${props.popoverChildren}
+            </div>
+          </${Portal}>
+        `}
+      `;
+    };
+
+    // Source: preact-portal (https://github.com/developit/preact-portal/blob/master/src/preact-portal.js)
+    /** Redirect rendering of descendants into the given CSS selector */
+    class Portal extends Component {
+      componentDidUpdate(props) {
+        for (let i in props) {
+          if (props[i] !== this.props[i]) {
+            return setTimeout(this.renderLayer);
+          }
+        }
+      }
+
+      componentDidMount() {
+        this.isMounted = true;
+        this.renderLayer = this.renderLayer.bind(this);
+        this.renderLayer();
+      }
+
+      componentWillUnmount() {
+        this.renderLayer(false);
+        this.isMounted = false;
+        if (this.remote && this.remote.parentNode) this.remote.parentNode.removeChild(this.remote);
+      }
+
+      findNode(node) {
+        return typeof node === 'string' ? document.querySelector(node) : node;
+      }
+
+      renderLayer(show = true) {
+        if (!this.isMounted) return;
+
+        // clean up old node if moving bases:
+        if (this.props.into !== this.intoPointer) {
+          this.intoPointer = this.props.into;
+          if (this.into && this.remote) {
+            this.remote = render(html`<${PortalProxy} />`, this.into, this.remote);
+          }
+          this.into = this.findNode(this.props.into);
+        }
+
+        this.remote = render(html`
+          <${PortalProxy} context=${this.context}>
+            ${show && this.props.children || null}
+          </${PortalProxy}>
+        `, this.into, this.remote);
+      }
+
+      render() {
+        return null;
+      }
+    }
+    // high-order component that renders its first child if it exists.
+    // used as a conditional rendering proxy.
+    class PortalProxy extends Component {
+      getChildContext() {
+        return this.props.context;
+      }
+      render({ children }) {
+        return children || null;
+      }
+    }
+
    function App(props) {

      return html`
-        <div id="container">
+        <div>
          <header>
            <h1>llama.cpp</h1>
          </header>
@@ -624,11 +839,13 @@
      `;
    }

-    render(h(App), document.body);
+    render(h(App), document.querySelector('#container'));
  </script>
 </head>

 <body>
+  <div id="container"></div>
+  <div id="portal"></div>
 </body>

 </html>
@@ -124,8 +124,9 @@ static void server_log(const char *level, const char *function, int line,
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
    std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
-    // if first bit is 1, meaning it's a partial character
-    if (out.size() > 0 && (out[0] & 0x80) == 0x80)
+    // if the size is 1 and first bit is 1, meaning it's a partial character
+    //   (size > 1 meaning it's already a known token)
+    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
    {
        std::stringstream ss;
        ss << std::hex << (out[0] & 0xff);
@@ -1208,6 +1209,62 @@ static void log_server_request(const Request &req, const Response &res)
                           });
 }

+bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
+    return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
+}
+
+// Function matching type llama_beam_search_callback_fn_t.
+// Custom callback example is called each time the beams lengths increase:
+//  * Show progress by printing ',' following by number of convergent beam tokens if any.
+//  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
+//    This is also called when the stop condition is met.
+//    Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
+void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
+    auto & llama = *static_cast<llama_server_context*>(callback_data);
+    // Mark beams as EOS as needed.
+    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
+        llama_beam_view& beam_view = beams_state.beam_views[i];
+        if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) {
+            beam_view.eob = true;
+        }
+    }
+    printf(",");  // Show progress
+    if (const size_t n = beams_state.common_prefix_length) {
+        llama.generated_token_probs.resize(llama.generated_token_probs.size() + n);
+        assert(0u < beams_state.n_beams);
+        const llama_token * tokens = beams_state.beam_views[0].tokens;
+        const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
+        std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
+        printf("%lu", n);
+    }
+    fflush(stdout);
+#if 0 // DEBUG: print current beams for this iteration
+    std::cout << "\n\nCurrent beams:\n";
+    for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
+        std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
+    }
+#endif
+}
+
+struct token_translator {
+    llama_context * ctx;
+    std::string operator()(llama_token tok) const { return llama_token_to_str(ctx, tok); }
+    std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
+};
+
+void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
+    auto & gtps = llama.generated_token_probs;
+    auto translator = token_translator{llama.ctx};
+    auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
+    const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
+    if (llama.generated_text.capacity() < llama.generated_text.size() + len) {
+        llama.generated_text.reserve(llama.generated_text.size() + len);
+    }
+    for (const completion_token_output & cto : gtps) {
+        llama.generated_text += translator(cto);
+    }
+}
+
 int main(int argc, char **argv)
 {
    // own arguments required by this example
@@ -1290,22 +1347,30 @@ int main(int argc, char **argv)
        llama.beginCompletion();

        if (!llama.stream) {
-            size_t stop_pos = std::string::npos;
+            if (llama.params.n_beams) {
+                // Fill llama.generated_token_probs vector with final beam.
+                llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
+                                  llama.n_past, llama.n_remain, llama.params.n_threads);
+                // Translate llama.generated_token_probs to llama.generated_text.
+                append_to_generated_text_from_generated_token_probs(llama);
+            } else {
+                size_t stop_pos = std::string::npos;

-            while (llama.has_next_token) {
-                const completion_token_output token_with_probs = llama.doCompletion();
-                const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
+                while (llama.has_next_token) {
+                    const completion_token_output token_with_probs = llama.doCompletion();
+                    const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);

-                stop_pos = llama.findStoppingStrings(llama.generated_text,
-                    token_text.size(), STOP_FULL);
-            }
+                    stop_pos = llama.findStoppingStrings(llama.generated_text,
+                        token_text.size(), STOP_FULL);
+                }

-            if (stop_pos == std::string::npos) {
-                stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
-            }
-            if (stop_pos != std::string::npos) {
-                llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
-                    llama.generated_text.end());
+                if (stop_pos == std::string::npos) {
+                    stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
+                }
+                if (stop_pos != std::string::npos) {
+                    llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
+                        llama.generated_text.end());
+                }
            }

            const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
@@ -1321,59 +1386,86 @@ int main(int argc, char **argv)

                while (llama.has_next_token) {
                    const completion_token_output token_with_probs = llama.doCompletion();
-                    const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
-                    if (llama.multibyte_pending > 0) {
+                    if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
                        continue;
                    }
+                    const std::string token_text = llama_token_to_str(llama.ctx, token_with_probs.tok);

                    size_t pos = std::min(sent_count, llama.generated_text.size());

                    const std::string str_test = llama.generated_text.substr(pos);
+                    bool is_stop_full = false;
                    size_t stop_pos =
                        llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
                    if (stop_pos != std::string::npos) {
+                        is_stop_full = true;
                        llama.generated_text.erase(
                            llama.generated_text.begin() + pos + stop_pos,
                            llama.generated_text.end());
                        pos = std::min(sent_count, llama.generated_text.size());
                    } else {
+                        is_stop_full = false;
                        stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
                            STOP_PARTIAL);
                    }

-                    const std::string to_send = llama.generated_text.substr(pos, stop_pos);
-                    sent_count += to_send.size();
+                    if (
+                        stop_pos == std::string::npos ||
+                        // Send rest of the text if we are at the end of the generation
+                        (!llama.has_next_token && !is_stop_full && stop_pos > 0)
+                    ) {
+                        const std::string to_send = llama.generated_text.substr(pos, std::string::npos);

-                    std::vector<completion_token_output> probs_output = {};
+                        sent_count += to_send.size();

-                    if (llama.params.n_probs > 0) {
-                        const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
-                        size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
-                        size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
-                        if (probs_pos < probs_stop_pos) {
-                            probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
+                        std::vector<completion_token_output> probs_output = {};
+
+                        if (llama.params.n_probs > 0) {
+                            const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
+                            size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
+                            size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
+                            if (probs_pos < probs_stop_pos) {
+                                probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
+                            }
+                            sent_token_probs_index = probs_stop_pos;
+                        }
+
+                        const json data = format_partial_response(llama, to_send, probs_output);
+
+                        const std::string str =
+                            "data: " +
+                            data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                            "\n\n";
+
+                        LOG_VERBOSE("data stream", {
+                            { "to_send", str }
+                        });
+
+                        if (!sink.write(str.data(), str.size())) {
+                            LOG_VERBOSE("stream closed", {});
+                            llama_print_timings(llama.ctx);
+                            return false;
                        }
-                        sent_token_probs_index = probs_stop_pos;
                    }

-                    const json data = llama.has_next_token
-                                          ? format_partial_response(llama, to_send, probs_output)
-                                          // Generation is done, send extra information.
-                                          : format_final_response(llama, to_send, llama.generated_token_probs);
+                    if (!llama.has_next_token) {
+                        // Generation is done, send extra information.
+                        const json data = format_final_response(llama, "", llama.generated_token_probs);

-                    const std::string str =
-                        "data: " +
-                        data.dump(-1, ' ', false, json::error_handler_t::replace) +
-                        "\n\n";
+                        const std::string str =
+                            "data: " +
+                            data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                            "\n\n";

-                    LOG_VERBOSE("data stream", {
-                        { "to_send", str }
-                    });
+                        LOG_VERBOSE("data stream", {
+                            { "to_send", str }
+                        });

-                    if (!sink.write(str.data(), str.size())) {
-                        LOG_VERBOSE("stream closed", {});
-                        llama_print_timings(llama.ctx);
-                        return false;
+                        if (!sink.write(str.data(), str.size())) {
+                            LOG_VERBOSE("stream closed", {});
+                            llama_print_timings(llama.ctx);
+                            return false;
+                        }
                    }
                }

@@ -6,15 +6,116 @@
 #include <atomic>
 #include <assert.h>

+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F  HIPBLAS_R_16F
+#define CUDA_R_32F  HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
+#endif

 #include "ggml-cuda.h"
 #include "ggml.h"

 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#ifndef CC_TURING
 #define CC_TURING   700
+#endif
+
+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int&>(c);
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(__gfx1100__)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+}
+#endif

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -424,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q4_1 * x = (const block_q4_1 *) vx;

-    const dfloat d = x[ib].dm.x;
-    const dfloat m = x[ib].dm.y;
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);

    const int vui = x[ib].qs[iqs];

@@ -467,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q5_1 * x = (const block_q5_1 *) vx;

-    const dfloat d = x[ib].dm.x;
-    const dfloat m = x[ib].dm.y;
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);

    uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -520,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
    const uint8_t q = x[i].qs[32*n + l];
    float * y = yy + i*QK_K + 128*n;

-    float dall = x[i].dm.x;
-    float dmin = x[i].dm.y;
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -531,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
    const int il = tid%16;  // 0...15
    const uint8_t q = x[i].qs[il] >> (2*is);
    float * y = yy + i*QK_K + 16*is + il;
-    float dall = x[i].dm.x;
-    float dmin = x[i].dm.y;
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
 #endif
@@ -618,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float

    float * y = yy + i*QK_K + 64*il + n*ir;

-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);

    const uint8_t * q = x[i].qs + 32*il + n*ir;

@@ -657,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float

    float * y = yy + i*QK_K + 64*il + 2*ir;

-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);

    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
    const uint8_t * qh = x[i].qh + 2*ir;
@@ -770,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
        const float   * y = yy + i * QK_K + y_offset;
        const uint8_t * q = x[i].qs + q_offset;

-        const float dall = x[i].dm.x;
-        const float dmin = x[i].dm.y;
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);

        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
        aux[0] = a[0] & 0x0f0f0f0f;
@@ -991,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
        const float   * y1 = yy + i*QK_K + y_offset;
        const float   * y2 = y1 + 128;

-        const float dall = x[i].dm.x;
-        const float dmin = x[i].dm.y;
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);

        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
@@ -1124,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
        const float   * y1  = yy + i*QK_K + y_offset;
        const float   * y2  = y1 + 128;

-        const float dall = x[i].dm.x;
-        const float dmin = x[i].dm.y;
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);

        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
@@ -1348,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
        return;
    }

-    y[ib].ds.x = d;
-    y[ib].ds.y = sum;
+    reinterpret_cast<half&>(y[ib].ds.x) = d;
+    reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -2346,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
    }

-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
 }

 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2432,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 #pragma unroll
    for (int i = 0; i < QR2_K; ++ i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
    }

    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2551,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 #pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
    }

    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2720,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(

    for (int i = 0; i < QR4_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2half(bq8i->ds);

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
@@ -2747,8 +2848,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    const float dall = bq4_K->d[0];
    const float dmin = bq4_K->d[1];

-    const float d8_1 = bq8_1[0].ds.x;
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_1 = __low2float(bq8_1[0].ds);
+    const float d8_2 = __low2float(bq8_1[1].ds);

    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2901,7 +3002,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2float(bq8i->ds);

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
@@ -2919,8 +3020,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(

    const float d = bq5_K->d;

-    const float d8_1 = bq8_1[0].ds.x;
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_1 = __low2half(bq8_1[0].ds);
+    const float d8_2 = __low2half(bq8_1[1].ds);

    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -3075,7 +3176,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
    }

    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3243,7 +3344,7 @@ static __device__ __forceinline__ void mul_mat_q(
                    *dsi_dst = *dsi_src;
                } else {
                    float * dfi_dst = (float *) dsi_dst;
-                    *dfi_dst = (*dsi_src).x;
+                    *dfi_dst = __low2half(*dsi_src);
                }
            }

@@ -4944,10 +5045,18 @@ void ggml_init_cublas() {
    static bool initialized = false;

    if (!initialized) {
+
+#ifdef __HIP_PLATFORM_AMD__
+        // Workaround for a rocBLAS bug when using multiple graphics cards:
+        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+        rocblas_initialize();
+        CUDA_CHECK(cudaDeviceSynchronize());
+#endif
+
        CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
        int64_t total_vram = 0;
-        fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
        for (int id = 0; id < g_device_count; ++id) {
            cudaDeviceProp prop;
            CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -2,6 +2,14 @@

 #include "ggml.h"

+#ifdef GGML_USE_HIPBLAS
+#define GGML_CUDA_NAME "ROCm"
+#define GGML_CUBLAS_NAME "hipBLAS"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#define GGML_CUBLAS_NAME "cuBLAS"
+#endif
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -1 +1 @@
-from .gguf import GGUFWriter
+from .gguf import *
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.2.0"
+version = "0.2.1"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
@@ -1836,7 +1836,7 @@ static void llm_load_tensors(
    (void) main_gpu;
    (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
-    LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
+    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
    ggml_cuda_set_main_device(main_gpu);
    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
@@ -4326,6 +4326,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }

+//
+// Beam search
+//
+
+struct llama_beam {
+    std::vector<llama_token> tokens;
+    float p;  // Cumulative beam probability (renormalized relative to all beams)
+    bool eob; // Initialize end-of-beam to false. Callback sets this to true.
+    // Sort beams by probability. In case of ties, prefer beams at eob.
+    bool operator<(const llama_beam & rhs) const {
+        return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
+    }
+    // Shift off first n tokens and discard them.
+    void shift_tokens(const size_t n) {
+        if (n) {
+            std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
+            tokens.resize(tokens.size() - n);
+        }
+    }
+    llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
+};
+
+// A struct for calculating logit-related info.
+struct llama_logit_info {
+    const float * const logits;
+    const int n_vocab;
+    const float max_l;
+    const float normalizer;
+    struct sum_exp {
+        float max_l;
+        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
+    };
+    llama_logit_info(llama_context * ctx)
+      : logits(llama_get_logits(ctx))
+      , n_vocab(llama_n_vocab(ctx))
+      , max_l(*std::max_element(logits, logits + n_vocab))
+      , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
+      { }
+    llama_token_data get_token_data(const llama_token token_id) const {
+        constexpr auto p = std::numeric_limits<float>::quiet_NaN();  // never used
+        return {token_id, logits[token_id], p};
+    }
+    // Return top k token_data by logit.
+    std::vector<llama_token_data> top_k(size_t k) {
+        std::vector<llama_token_data> min_heap;  // min-heap by logit
+        const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
+        min_heap.reserve(k_min);
+        for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
+            min_heap.push_back(get_token_data(token_id));
+        }
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
+        std::make_heap(min_heap.begin(), min_heap.end(), comp);
+        for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
+            if (min_heap.front().logit < logits[token_id]) {
+                std::pop_heap(min_heap.begin(), min_heap.end(), comp);
+                min_heap.back().id = token_id;
+                min_heap.back().logit = logits[token_id];
+                std::push_heap(min_heap.begin(), min_heap.end(), comp);
+            }
+        }
+        return min_heap;
+    }
+    float probability_from_logit(float logit) {
+        return normalizer * std::exp(logit - max_l);
+    }
+};
+
+struct llama_beam_search_data {
+    llama_context * ctx;
+    size_t n_beams;
+    int n_past;
+    int n_predict;
+    int n_threads;
+    std::vector<llama_beam> beams;
+    std::vector<llama_beam> next_beams;
+
+    // Re-calculated on each loop iteration
+    size_t common_prefix_length;
+
+    // Used to communicate to/from callback on beams state.
+    std::vector<llama_beam_view> beam_views;
+
+    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+      : ctx(ctx)
+      , n_beams(n_beams)
+      , n_past(n_past)
+      , n_predict(n_predict)
+      , n_threads(n_threads)
+      , beam_views(n_beams) {
+        beams.reserve(n_beams);
+        next_beams.reserve(n_beams);
+    }
+
+    // Collapse beams to a single beam given by index.
+    void collapse_beams(const size_t beam_idx) {
+        if (0u < beam_idx) {
+            std::swap(beams[0], beams[beam_idx]);
+        }
+        beams.resize(1);
+    }
+
+    // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
+    // The repetative patterns below reflect the 2 stages of heaps:
+    //  * Gather elements until the vector is full, then call std::make_heap() on it.
+    //  * If the heap is full and a new element is found that should be included, pop the
+    //    least element to the back(), replace it with the new, then push it into the heap.
+    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
+        // Min-heaps use a greater-than comparator.
+        const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
+        if (beam.eob) {
+            // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
+            if (next_beams.size() < n_beams) {
+                next_beams.push_back(std::move(beam));
+                if (next_beams.size() == n_beams) {
+                    std::make_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            } else if (next_beams.front().p < beam.p) {
+                std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                next_beams.back() = std::move(beam);
+                std::push_heap(next_beams.begin(), next_beams.end(), comp);
+            }
+        } else {
+            // beam is not at end-of-sentence, so branch with next top_k tokens.
+            if (!beam.tokens.empty()) {
+                llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
+            }
+            llama_logit_info logit_info(ctx);
+            std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+            size_t i=0;
+            if (next_beams.size() < n_beams) {
+                for (; next_beams.size() < n_beams ; ++i) {
+                    llama_beam next_beam = beam;
+                    next_beam.tokens.push_back(next_tokens[i].id);
+                    next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    next_beams.push_back(std::move(next_beam));
+                }
+                std::make_heap(next_beams.begin(), next_beams.end(), comp);
+            } else {
+                for (; next_beams.front().p == 0.0f ; ++i) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = beam;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
+            for (; i < n_beams ; ++i) {
+                const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
+                if (next_beams.front().p < next_p) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = beam;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p = next_p;
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
+        }
+    }
+
+    // Find common_prefix_length based on beams.
+    // Requires beams is not empty.
+    size_t find_common_prefix_length() {
+        size_t common_prefix_length = beams[0].tokens.size();
+        for (size_t i = 1 ; i < beams.size() ; ++i) {
+            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
+            for (size_t j = 0 ; j < common_prefix_length ; ++j) {
+                if (beams[0].tokens[j] != beams[i].tokens[j]) {
+                    common_prefix_length = j;
+                    break;
+                }
+            }
+        }
+        return common_prefix_length;
+    }
+
+    // Construct beams_state to send back to caller via the callback function.
+    // Side effect: set common_prefix_length = find_common_prefix_length();
+    llama_beams_state get_beams_state(const bool last_call) {
+        for (size_t i = 0 ; i < beams.size() ; ++i) {
+            beam_views[i] = beams[i].view();
+        }
+        common_prefix_length = find_common_prefix_length();
+        return {beam_views.data(), beams.size(), common_prefix_length, last_call};
+    }
+
+    // Loop:
+    //  * while i < n_predict, AND
+    //  * any of the beams have not yet reached end-of-beam (eob), AND
+    //  * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
+    //    (since all other beam probabilities can only decrease)
+    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
+        beams.push_back({{}, 1.0f, false});  // Start with one empty beam w/ probability = 1.0 and !eob.
+        const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
+        for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
+                       !beams[top_beam_index()].eob ; ++i) {
+            callback(callback_data, get_beams_state(false));  // Sets common_prefix_length
+            update_beams_from_beam_views();   // Update values (p,eob) that callback may have changed.
+            if (common_prefix_length) {
+                llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+                n_past += common_prefix_length;
+            }
+            // Zero-out next_beam probabilities to place them last in following min-heap.
+            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
+            for (llama_beam & beam : beams) {
+                beam.shift_tokens(common_prefix_length);
+                fill_next_beams_by_top_probabilities(beam);
+            }
+            // next_beams become the beams of next/final iteration. Swap them to re-use memory.
+            beams.swap(next_beams);
+            renormalize_beam_probabilities(beams);
+        }
+        collapse_beams(top_beam_index());
+        callback(callback_data, get_beams_state(true));
+    }
+
+    // As beams grow, the cumulative probabilities decrease.
+    // Renormalize them to avoid floating point underflow.
+    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+        const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
+        const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
+        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
+    }
+
+    // Assumes beams is non-empty.  Uses llama_beam::operator<() for ordering.
+    size_t top_beam_index() {
+        return std::max_element(beams.begin(), beams.end()) - beams.begin();
+    }
+
+    // Copy (p,eob) for each beam which may have been changed by the callback.
+    void update_beams_from_beam_views() {
+        for (size_t i = 0 ; i < beams.size() ; ++i) {
+            beams[i].p = beam_views[i].p;
+            beams[i].eob = beam_views[i].eob;
+        }
+    }
+};
+
+void llama_beam_search(llama_context * ctx,
+                       llama_beam_search_callback_fn_t callback, void * callback_data,
+                       size_t n_beams, int n_past, int n_predict, int n_threads) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
+
+    beam_search_data.loop(callback, callback_data);
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    ctx->n_sample++;
+}
+
 //
 // quantization
 //
@@ -5297,13 +5548,29 @@ int llama_model_n_embd(const struct llama_model * model) {
    return model->hparams.n_embd;
 }

-int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s %s %s",
            model->name.c_str(),
            llama_model_type_name(model->type),
            llama_model_ftype_name(model->ftype).c_str());
 }

+uint64_t llama_model_size(const struct llama_model * model) {
+    uint64_t size = 0;
+    for (const auto & it : model->tensors_by_name) {
+        size += ggml_nbytes(it.second);
+    }
+    return size;
+}
+
+uint64_t llama_model_n_params(const struct llama_model * model) {
+    uint64_t nparams = 0;
+    for (const auto & it : model->tensors_by_name) {
+        nparams += ggml_nelements(it.second);
+    }
+    return nparams;
+}
+
 int llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
@@ -254,7 +254,11 @@ extern "C" {
    LLAMA_API int llama_model_n_embd (const struct llama_model * model);

    // Get a string describing the model type
-    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    // Returns the total size of all the tensors in the model in bytes
+    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+    // Returns the total number of parameters in the model
+    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

    // Returns 0 on success
    LLAMA_API int llama_model_quantize(
@@ -465,6 +469,43 @@ extern "C" {
    /// @details Accepts the sampled token into the grammar
    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);

+    //
+    // Beam search
+    //
+
+    struct llama_beam_view {
+        const llama_token * tokens;
+        size_t n_tokens;
+        float p;   // Cumulative beam probability (renormalized relative to all beams)
+        bool eob;  // Callback should set this to true when a beam is at end-of-beam.
+    };
+
+    // Passed to beam_search_callback function.
+    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+    // These pointers are valid only during the synchronous callback, so should not be saved.
+    struct llama_beams_state {
+        struct llama_beam_view * beam_views;
+        size_t n_beams;               // Number of elements in beam_views[].
+        size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
+        bool last_call;               // True iff this is the last callback invocation.
+    };
+
+    // Type of pointer to the beam_search_callback function.
+    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, llama_beams_state);
+
+    /// @details Deterministically returns entire sentence constructed by a beam search.
+    /// @param ctx Pointer to the llama_context.
+    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+    /// @param callback_data A pointer that is simply passed back to callback.
+    /// @param n_beams Number of beams to use.
+    /// @param n_past Number of tokens already evaluated.
+    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+    /// @param n_threads Number of threads as passed to llama_eval().
+    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+
    // Performance information
    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
Author	SHA1	Message	Date
Marcus Dunn	232caf3c15	llama : fix struct decl (#2790 )	2023-08-25 19:17:15 +03:00
Kawrakow	d046dcee08	Faster perplexity computation (#2786 ) Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2023-08-25 19:05:02 +03:00
Matt Pulver	c82742ac9c	llama : add llama_beam_search() (#2267 ) * Add llama_beam_search(). * Add '// Beam search' heading to llama.{h,cpp} after llama_grammar_accept_token(). * Add space around * pointers and & references. * Add spaces around comparison and assignment operators. * Prefer west const. * Use llama_ prefix for structs in global namespace. * Delete obsolete comment from an earlier revision. * Change eos to eob in llama_beam and llama_beam_view structs.	2023-08-25 18:18:48 +03:00
Nigel Bosch	28b2c996ca	convert.py : Get rope scale from HuggingFace models (#2772 ) * Get rope scale from HF models * Save rope scale only for linear scaling * Rewrite for clarity	2023-08-25 16:41:52 +02:00
slaren	154725c543	llama-bench : add model sizes (#2771 ) * llama-bench : add model sizes * more compact markdown output * back to GiB * adjust column sizes	2023-08-25 15:16:19 +02:00
slaren	12e2e33a97	convert.py : export rope freq_base when converting CodeLlama from an HF model (#2773 )	2023-08-25 14:08:53 +02:00
Jhen-Jie Hong	29674ab4e8	server : display token probabilities in the UI (#2489 ) * server : add n_probs param in chat UI * server : keep message data array & show in probabilites component * server : add simple popover component * server : fix completion_probabilities undefined if not set n_probs * server : implement Probabilites * server : handle bytes * server : make n_probs max to 10 for easy scroll * server : adjust for dark/light mode * server : Fix regenerated prompt * server : update index.html.hpp * server : convert prob to percentage + show original value as div title * server : fix Probabilites not used if included empty str * server : skip byte pair in display probabilites * server : remove array check of completion_probabilities in messages * skip empty array or byte pair (> 1) in Probabilites * generate index.html.hpp * fix incorrect prob convert if the str is already a known token * use final response to show probabilities on stop * revert unnecessary change * correct probabilites usage * remove unused function * always send partial response for get correct probs of last to_send * fix typo * fix content of format_final_response * refactor probs render & make pColor transparent if not found * send empty string when got stop_pos in partial * avoid unnecessary empty data event & send rest of partial tokens on stop * use <br /> for new line * skip -1 tok in loop to avoid send '' on end * trim last new lines on stop * revert unnecessary change	2023-08-25 18:32:45 +08:00
Georgi Gerganov	5439a0ab57	ci : pip install gguf in editable mode (#2782 ) ggml-ci	2023-08-25 13:03:25 +03:00
M. Yusuf Sarıgöz	8194cd8772	gguf : export objects to user code (#2780 ) * gguf export more objects to user code * gguf export all objects to user code for now * gguf : bump version	2023-08-25 12:43:41 +03:00
Henri Vasserman	6bbc598a63	ROCm Port (#1087 ) * use hipblas based on cublas * Update Makefile for the Cuda kernels * Expand arch list and make it overrideable * Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5) * add hipBLAS to README * new build arg LLAMA_CUDA_MMQ_Y * fix half2 decomposition * Add intrinsics polyfills for AMD * AMD assembly optimized __dp4a * Allow overriding CC_TURING * use "ROCm" instead of "CUDA" * ignore all build dirs * Add Dockerfiles * fix llama-bench * fix -nommq help for non CUDA/HIP --------- Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com> Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com> Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Co-authored-by: jammm <2500920+jammm@users.noreply.github.com> Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com>	2023-08-25 12:09:42 +03:00