speculative : add --draft CLI arg

speculative : print encoding speed
speculative : initial example
2023-09-03 13:51:07 +03:00 · 2023-09-03 13:45:13 +03:00 · 2023-09-03 13:44:31 +03:00
27 changed files with 2412 additions and 2765 deletions
@@ -17,6 +17,3 @@ indent_style = tab

 [prompts/*.txt]
 insert_final_newline = unset
-
-[examples/server/public/*]
-indent_size = 2
@@ -31,29 +31,27 @@ tmp/
 models/*
 models-mnt

-/Pipfile
-/baby-llama
-/beam-search
-/benchmark-matmult
-/convert-llama2c-to-ggml
-/embd-input-test
+/main
+/quantize
+/quantize-stats
+/result
+/perplexity
 /embedding
+/train-text-from-scratch
+/convert-llama2c-to-ggml
+/simple
+/benchmark-matmult
+/vdot
+/server
+/Pipfile
+/embd-input-test
 /gguf
 /gguf-llama-simple
 /libllama.so
 /llama-bench
-/main
-/metal
-/perplexity
-/quantize
-/quantize-stats
-/result
+/baby-llama
+/beam-search
 /save-load-state
-/server
-/simple
-/speculative
-/train-text-from-scratch
-/vdot
 build-info.h
 arm_neon.h
 compile_commands.json
@@ -36,12 +36,6 @@ endif()
 # Option list
 #

-if (APPLE)
-    set(LLAMA_METAL_DEFAULT ON)
-else()
-    set(LLAMA_METAL_DEFAULT OFF)
-endif()
-
 # general
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
 option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      OFF)
@@ -82,7 +76,7 @@ option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
-option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
+option(LLAMA_METAL                           "llama: use Metal"                                 OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_K_QUANTS                        "llama: use k-quants"                              ON)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
@@ -164,31 +158,6 @@ if (APPLE AND LLAMA_ACCELERATE)
    endif()
 endif()

-if (LLAMA_METAL)
-    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
-    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
-    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)
-
-    message(STATUS "Metal framework found")
-
-    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
-
-    add_compile_definitions(GGML_USE_METAL)
-    #add_compile_definitions(GGML_METAL_NDEBUG)
-
-    # get full path to the file
-    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
-
-    # copy ggml-metal.metal to bin directory
-    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
-
-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        )
-endif()
-
 if (LLAMA_BLAS)
    if (LLAMA_STATIC)
        set(BLA_STATIC ON)
@@ -324,6 +293,29 @@ if (LLAMA_CUBLAS)
    endif()
 endif()

+if (LLAMA_METAL)
+    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
+    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
+    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)
+
+    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
+
+    add_compile_definitions(GGML_USE_METAL)
+    #add_compile_definitions(GGML_METAL_NDEBUG)
+
+    # get full path to the file
+    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
+
+    # copy ggml-metal.metal to bin directory
+    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+        )
+endif()
+
 if (LLAMA_MPI)
    cmake_minimum_required(VERSION 3.10)
    find_package(MPI)
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1
@@ -7,39 +7,6 @@ TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-dou
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report

-ifndef UNAME_S
-UNAME_S := $(shell uname -s)
-endif
-
-ifndef UNAME_P
-UNAME_P := $(shell uname -p)
-endif
-
-ifndef UNAME_M
-UNAME_M := $(shell uname -m)
-endif
-
-# Mac OS + Arm can report x86_64
-# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
-ifeq ($(UNAME_S),Darwin)
-	ifndef LLAMA_NO_METAL
-		LLAMA_METAL := 1
-	endif
-
-	ifneq ($(UNAME_P),arm)
-		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
-		ifeq ($(SYSCTL_M),1)
-			# UNAME_P := arm
-			# UNAME_M := arm64
-			warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
-		endif
-	endif
-endif
-
-ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
-BUILD_TARGETS += metal
-endif
-
 default: $(BUILD_TARGETS)

 test:
@@ -71,6 +38,18 @@ gcovr-report: coverage ## Generate gcovr report
 	mkdir -p gcovr-report
 	gcovr --root . --html --html-details --output gcovr-report/coverage.html

+ifndef UNAME_S
+UNAME_S := $(shell uname -s)
+endif
+
+ifndef UNAME_P
+UNAME_P := $(shell uname -p)
+endif
+
+ifndef UNAME_M
+UNAME_M := $(shell uname -m)
+endif
+
 ifdef RISCV_CROSS_COMPILE
 CC	:= riscv64-unknown-linux-gnu-gcc
 CXX	:= riscv64-unknown-linux-gnu-g++
@@ -79,6 +58,19 @@ endif
 CCV := $(shell $(CC) --version | head -n 1)
 CXXV := $(shell $(CXX) --version | head -n 1)

+# Mac OS + Arm can report x86_64
+# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
+ifeq ($(UNAME_S),Darwin)
+	ifneq ($(UNAME_P),arm)
+		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
+		ifeq ($(SYSCTL_M),1)
+			# UNAME_P := arm
+			# UNAME_M := arm64
+			warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
+		endif
+	endif
+endif
+
 #
 # Compile flags
 #
@@ -239,24 +231,14 @@ endif
 endif

 ifndef LLAMA_NO_ACCELERATE
-	# Mac OS - include Accelerate framework.
-	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
+	# Mac M1 - include Accelerate framework.
+	# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
 	ifeq ($(UNAME_S),Darwin)
 		MK_CPPFLAGS += -DGGML_USE_ACCELERATE
 		MK_LDFLAGS  += -framework Accelerate
 	endif
 endif # LLAMA_NO_ACCELERATE

-ifdef LLAMA_METAL
-	# By default - use GPU acceleration on Mac OS
-	ifeq ($(UNAME_S),Darwin)
-		CFLAGS   += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
-		CXXFLAGS += -DGGML_USE_METAL
-		LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
-		OBJS     += ggml-metal.o
-	endif
-endif # LLAMA_METAL
-
 ifdef LLAMA_MPI
 	MK_CPPFLAGS += -DGGML_USE_MPI
 	MK_CFLAGS   += -Wno-cast-qual
@@ -495,8 +477,9 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
 beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
+BUILD_TARGETS += metal
+endif

 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
@@ -280,11 +280,29 @@ In order to build llama.cpp you have three different options.

 ### Metal Build

-On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
-To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
+Using Metal allows the computation to be executed on the GPU for Apple devices:

-When built with Metal support, you can explicitly disable GPU inference with the `--gpu-layers|-ngl 0` command-line
-argument.
+- Using `make`:
+
+  ```bash
+  LLAMA_METAL=1 make
+  ```
+
+- Using `CMake`:
+
+    ```bash
+    mkdir build-metal
+    cd build-metal
+    cmake -DLLAMA_METAL=ON ..
+    cmake --build . --config Release
+    ```
+
+When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument.
+Any value larger than 0 will offload the computation to the GPU. For example:
+
+```bash
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
+```

 ### MPI Build

@@ -584,109 +584,109 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }

 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    printf("usage: %s [options]\n", argv[0]);
-    printf("\n");
-    printf("options:\n");
-    printf("  -h, --help            show this help message and exit\n");
-    printf("  -i, --interactive     run in interactive mode\n");
-    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
-    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
-    printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
-    printf("  -r PROMPT, --reverse-prompt PROMPT\n");
-    printf("                        halt generation at PROMPT, return control in interactive mode\n");
-    printf("                        (can be specified more than once for multiple prompts).\n");
-    printf("  --color               colorise output to distinguish prompt and user input from generations\n");
-    printf("  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    printf("  -p PROMPT, --prompt PROMPT\n");
-    printf("                        prompt to start generation with (default: empty)\n");
-    printf("  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    printf("  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
-    printf("  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
-    printf("                        not supported with --interactive or other interactive options\n");
-    printf("  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
-    printf("  --random-prompt       start with a randomized prompt.\n");
-    printf("  --in-prefix-bos       prefix BOS to user inputs, preceding the `--in-prefix` string\n");
-    printf("  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
-    printf("  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
-    printf("  -f FNAME, --file FNAME\n");
-    printf("                        prompt file to start generation.\n");
-    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
-    printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
-    printf("  --mirostat N          use Mirostat sampling.\n");
-    printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-    printf("  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
-    printf("                        modifies the likelihood of token appearing in the completion,\n");
-    printf("                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
-    printf("                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    printf("  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
-    printf("  --grammar-file FNAME  file to read grammar from\n");
-    printf("  --cfg-negative-prompt PROMPT\n");
-    printf("                        negative prompt to use for guidance. (default: empty)\n");
-    printf("  --cfg-negative-prompt-file FNAME\n");
-    printf("                        negative prompt file to use for guidance. (default: empty)\n");
-    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
-    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
-    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
-    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    printf("  --no-penalize-nl      do not penalize newline token\n");
-    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
-    printf("  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    printf("  --perplexity          compute perplexity over each ctx window of the prompt\n");
-    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
-    printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
-    printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
-    printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
+    fprintf(stdout, "usage: %s [options]\n", argv[0]);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "options:\n");
+    fprintf(stdout, "  -h, --help            show this help message and exit\n");
+    fprintf(stdout, "  -i, --interactive     run in interactive mode\n");
+    fprintf(stdout, "  --interactive-first   run in interactive mode and wait for input right away\n");
+    fprintf(stdout, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
+    fprintf(stdout, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
+    fprintf(stdout, "  -r PROMPT, --reverse-prompt PROMPT\n");
+    fprintf(stdout, "                        halt generation at PROMPT, return control in interactive mode\n");
+    fprintf(stdout, "                        (can be specified more than once for multiple prompts).\n");
+    fprintf(stdout, "  --color               colorise output to distinguish prompt and user input from generations\n");
+    fprintf(stdout, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
+    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stdout, "  -p PROMPT, --prompt PROMPT\n");
+    fprintf(stdout, "                        prompt to start generation with (default: empty)\n");
+    fprintf(stdout, "  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
+    fprintf(stdout, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
+    fprintf(stdout, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
+    fprintf(stdout, "                        not supported with --interactive or other interactive options\n");
+    fprintf(stdout, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
+    fprintf(stdout, "  --random-prompt       start with a randomized prompt.\n");
+    fprintf(stdout, "  --in-prefix-bos       prefix BOS to user inputs, preceding the `--in-prefix` string\n");
+    fprintf(stdout, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
+    fprintf(stdout, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
+    fprintf(stdout, "  -f FNAME, --file FNAME\n");
+    fprintf(stdout, "                        prompt file to start generation.\n");
+    fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
+    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+    fprintf(stdout, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+    fprintf(stdout, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    fprintf(stdout, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stdout, "  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    fprintf(stdout, "  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    fprintf(stdout, "  --mirostat N          use Mirostat sampling.\n");
+    fprintf(stdout, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
+    fprintf(stdout, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+    fprintf(stdout, "  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    fprintf(stdout, "  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    fprintf(stdout, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+    fprintf(stdout, "                        modifies the likelihood of token appearing in the completion,\n");
+    fprintf(stdout, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
+    fprintf(stdout, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+    fprintf(stdout, "  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
+    fprintf(stdout, "  --grammar-file FNAME  file to read grammar from\n");
+    fprintf(stdout, "  --cfg-negative-prompt PROMPT\n");
+    fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
+    fprintf(stdout, "  --cfg-negative-prompt-file FNAME\n");
+    fprintf(stdout, "                        negative prompt file to use for guidance. (default: empty)\n");
+    fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    fprintf(stdout, "  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
+    fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
+    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
+    fprintf(stdout, "  --perplexity          compute perplexity over each ctx window of the prompt\n");
+    fprintf(stdout, "  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
+    fprintf(stdout, "  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
+    fprintf(stdout, "  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    fprintf(stdout, "  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
+    fprintf(stdout, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
    if (llama_mlock_supported()) {
-        printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
    if (llama_mmap_supported()) {
-        printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
-    printf("  --numa                attempt optimizations that help on some NUMA systems\n");
-    printf("                        if run without this previously, it is recommended to drop the system page cache before using this\n");
-    printf("                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
+    fprintf(stdout, "  --numa                attempt optimizations that help on some NUMA systems\n");
+    fprintf(stdout, "                        if run without this previously, it is recommended to drop the system page cache before using this\n");
+    fprintf(stdout, "                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    printf("  -ngl N, --n-gpu-layers N\n");
-    printf("                        number of layers to store in VRAM\n");
-    printf("  -ts SPLIT --tensor-split SPLIT\n");
-    printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    printf("  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stdout, "                        number of layers to store in VRAM\n");
+    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
 #ifdef GGML_USE_CUBLAS
-    printf("  -nommq, --no-mul-mat-q\n");
-    printf("                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
-    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif // GGML_USE_CUBLAS
 #endif
-    printf("  --mtest               compute maximum memory usage\n");
-    printf("  --export              export the computation graph to 'llama.ggml'\n");
-    printf("  --verbose-prompt      print prompt before generation\n");
+    fprintf(stdout, "  --mtest               compute maximum memory usage\n");
+    fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");
+    fprintf(stdout, "  --verbose-prompt      print prompt before generation\n");
    fprintf(stderr, "  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
-    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    printf("  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    printf("  -m FNAME, --model FNAME\n");
-    printf("                        model path (default: %s)\n", params.model.c_str());
-    printf("  -md FNAME, --model-draft FNAME\n");
-    printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
-    printf("  -ld LOGDIR, --logdir LOGDIR\n");
-    printf("                        path under which to save YAML logs (no logging if unset)\n");
-    printf("\n");
+    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stdout, "  -m FNAME, --model FNAME\n");
+    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "  -md FNAME, --model-draft FNAME\n");
+    fprintf(stdout, "                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "  -ld LOGDIR, --logdir LOGDIR\n");
+    fprintf(stdout, "                        path under which to save YAML logs (no logging if unset)\n");
+    fprintf(stdout, "\n");
 }

 std::string gpt_random_prompt(std::mt19937 & rng) {
@@ -717,9 +717,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param

    lparams.n_ctx           = params.n_ctx;
    lparams.n_batch         = params.n_batch;
-    if (params.n_gpu_layers != -1) {
-        lparams.n_gpu_layers = params.n_gpu_layers;
-    }
+    lparams.n_gpu_layers    = params.n_gpu_layers;
    lparams.main_gpu        = params.main_gpu;
    lparams.tensor_split    = params.tensor_split;
    lparams.low_vram        = params.low_vram;
@@ -772,7 +770,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    {
        LOG("warming up the model with an empty run\n");

-        const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
+        const std::vector<llama_token> tmp = { llama_token_bos(lctx), };
        llama_eval(lctx, tmp.data(), tmp.size(), 0, params.n_threads);
        llama_reset_timings(lctx);
    }
@@ -1214,7 +1212,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
    fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
+    fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);
    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
@@ -34,7 +34,7 @@ struct gpt_params {
    int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
    int32_t n_draft                         = 16;   // number of tokens to draft during speculative decoding
    int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
-    int32_t n_gpu_layers                    = -1;   // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers                    = 0;    // number of layers to store in VRAM
    int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
    int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
@@ -513,16 +513,16 @@ inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string &

 inline void log_print_usage()
 {
-    printf("log options:\n");
+    fprintf(stdout, "log options:\n");
    /* format
-    printf("  -h, --help            show this help message and exit\n");*/
+    fprintf(stdout, "  -h, --help            show this help message and exit\n");*/
    /* spacing
-    printf("__-param----------------Description\n");*/
-    printf("  --log-test            Run simple logging test\n");
-    printf("  --log-disable         Disable trace logs\n");
-    printf("  --log-enable          Enable trace logs\n");
-    printf("  --log-file            Specify a log filename (without extension)\n");
-    printf("                        Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /*  */
+    fprintf(stdout, "__-param----------------Description\n");*/
+    fprintf(stdout, "  --log-test            Run simple logging test\n");
+    fprintf(stdout, "  --log-disable         Disable trace logs\n");
+    fprintf(stdout, "  --log-enable          Enable trace logs\n");
+    fprintf(stdout, "  --log-file            Specify a log filename (without extension)\n");
+    fprintf(stdout, "                        Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /*  */
 }

 #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
@@ -55,10 +55,10 @@ def count_model_parts(dir_model: Path) -> int:

 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
-    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
-    parser.add_argument("--outfile",    type=Path,           help="path to write to; default: based on input")
-    parser.add_argument("model",        type=Path,           help="directory containing model file, or model file itself (*.bin)")
-    parser.add_argument("ftype",        type=int,            help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
+    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
+    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
+    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.bin)")
+    parser.add_argument("ftype",     type=int, choices=[0, 1],   help="output format - use 0 for float32, 1 for float16", default = 1)
    return parser.parse_args()

 args = parse_args()
@@ -673,7 +673,7 @@ class LazyUnpickler(pickle.Unpickler):
        assert isinstance(pid[1], LazyStorageKind)
        data_type = pid[1].data_type
        filename_stem = pid[2]
-        filename = f'{self.data_base_path}/{filename_stem}'
+        filename = self.data_base_path + '/' + filename_stem
        info = self.zip_file.getinfo(filename)

        def load(offset: int, elm_count: int) -> NDArray:
@@ -689,6 +689,7 @@ class LazyUnpickler(pickle.Unpickler):

    @staticmethod
    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
+                               # pyright: ignore[reportSelfClsParameterName]
                               requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
        assert isinstance(storage, LazyStorage)

@@ -76,7 +76,7 @@ bool gguf_ex_write(const std::string & fname) {

    gguf_write_to_file(ctx, fname.c_str(), false);

-    printf("%s: wrote file '%s;\n", __func__, fname.c_str());
+    fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());

    ggml_free(ctx_data);
    gguf_free(ctx);
@@ -93,20 +93,20 @@ bool gguf_ex_read_0(const std::string & fname) {

    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);

-    printf("%s: version:      %d\n", __func__, gguf_get_version(ctx));
-    printf("%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
-    printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+    fprintf(stdout, "%s: version:      %d\n", __func__, gguf_get_version(ctx));
+    fprintf(stdout, "%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
+    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));

    // kv
    {
        const int n_kv = gguf_get_n_kv(ctx);

-        printf("%s: n_kv: %d\n", __func__, n_kv);
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);

        for (int i = 0; i < n_kv; ++i) {
            const char * key = gguf_get_key(ctx, i);

-            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
        }
    }

@@ -116,10 +116,10 @@ bool gguf_ex_read_0(const std::string & fname) {

        const int keyidx = gguf_find_key(ctx, findkey);
        if (keyidx == -1) {
-            printf("%s: find key: %s not found.\n", __func__, findkey);
+            fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
        } else {
            const char * key_value = gguf_get_val_str(ctx, keyidx);
-            printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
+            fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
        }
    }

@@ -127,13 +127,13 @@ bool gguf_ex_read_0(const std::string & fname) {
    {
        const int n_tensors = gguf_get_n_tensors(ctx);

-        printf("%s: n_tensors: %d\n", __func__, n_tensors);
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);

        for (int i = 0; i < n_tensors; ++i) {
            const char * name   = gguf_get_tensor_name  (ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);

-            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
        }
    }

@@ -153,20 +153,20 @@ bool gguf_ex_read_1(const std::string & fname) {

    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);

-    printf("%s: version:      %d\n", __func__, gguf_get_version(ctx));
-    printf("%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
-    printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+    fprintf(stdout, "%s: version:      %d\n", __func__, gguf_get_version(ctx));
+    fprintf(stdout, "%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
+    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));

    // kv
    {
        const int n_kv = gguf_get_n_kv(ctx);

-        printf("%s: n_kv: %d\n", __func__, n_kv);
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);

        for (int i = 0; i < n_kv; ++i) {
            const char * key = gguf_get_key(ctx, i);

-            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
        }
    }

@@ -174,13 +174,13 @@ bool gguf_ex_read_1(const std::string & fname) {
    {
        const int n_tensors = gguf_get_n_tensors(ctx);

-        printf("%s: n_tensors: %d\n", __func__, n_tensors);
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);

        for (int i = 0; i < n_tensors; ++i) {
            const char * name   = gguf_get_tensor_name  (ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);

-            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
        }
    }

@@ -189,13 +189,13 @@ bool gguf_ex_read_1(const std::string & fname) {
        const int n_tensors = gguf_get_n_tensors(ctx);

        for (int i = 0; i < n_tensors; ++i) {
-            printf("%s: reading tensor %d data\n", __func__, i);
+            fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);

            const char * name = gguf_get_tensor_name(ctx, i);

            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

-            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+            fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);

            // print first 10 elements
            const float * data = (const float *) cur->data;
@@ -219,7 +219,7 @@ bool gguf_ex_read_1(const std::string & fname) {
        }
    }

-    printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
+    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));

    ggml_free(ctx_data);
    gguf_free(ctx);
@@ -229,7 +229,7 @@ bool gguf_ex_read_1(const std::string & fname) {

 int main(int argc, char ** argv) {
    if (argc < 3) {
-        printf("usage: %s data.gguf r|w\n", argv[0]);
+        fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
        return -1;
    }

@@ -305,9 +305,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)

    struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
    if( cur == NULL ) {
-        printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
+        fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
    } else {
-//        printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
+//        fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
    }

    return cur;
@@ -333,21 +333,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
        return false;
    }

-    printf("%s: gguf version     = %d\n", __func__, gguf_get_version(ggufctx));
-    printf("%s: gguf alignment   = %zu\n", __func__, gguf_get_alignment(ggufctx));
-    printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
+    fprintf(stdout, "%s: gguf version     = %d\n", __func__, gguf_get_version(ggufctx));
+    fprintf(stdout, "%s: gguf alignment   = %zu\n", __func__, gguf_get_alignment(ggufctx));
+    fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));

    // print all kv
    #if 0
    {
        const int n_kv = gguf_get_n_kv(ggufctx);

-        printf("%s: n_kv: %d\n", __func__, n_kv);
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);

        for (int i = 0; i < n_kv; ++i) {
            const char * key = gguf_get_key(ggufctx, i);

-            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
        }
    }
    #endif
@@ -357,21 +357,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
        int keyidx;

        keyidx = gguf_find_key(ggufctx, "general.name");
-        if (keyidx != -1) { printf("%s: model name           = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model name           = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.description");
-        if (keyidx != -1) { printf("%s: model description    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model description    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.author");
-        if (keyidx != -1) { printf("%s: model author         = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model author         = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.license");
-        if (keyidx != -1) { printf("%s: model license        = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model license        = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.architecture");
-        if (keyidx != -1) { printf("%s: model architecture   = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model architecture   = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.file_type");
-        if (keyidx != -1) { printf("%s: model file type      = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model file type      = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
-        if (keyidx != -1) { printf("%s: model data layout    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model data layout    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
-        if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    }

    // check required metadata
@@ -382,11 +382,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
        keyidx = gguf_find_key(ggufctx, "general.architecture");
        if (keyidx != -1) {
            if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) {
-                printf("%s: model architecture not supported!\n", __func__);
+                fprintf(stdout, "%s: model architecture not supported!\n", __func__);
                return false;
            }
        } else {
-            printf("%s: gguf model architecture not found!\n", __func__);
+            fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
            return false;
        }

@@ -394,11 +394,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
        keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout");
        if (keyidx != -1) {
            if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) {
-                printf("%s: model tensor data layout not supported!\n", __func__);
+                fprintf(stdout, "%s: model tensor data layout not supported!\n", __func__);
                return false;
            }
        } else {
-            printf("%s: gguf model tensor data layout not found!\n", __func__);
+            fprintf(stdout, "%s: gguf model tensor data layout not found!\n", __func__);
            return false;
        }

@@ -455,11 +455,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_

        if (keyidx != -1) {
            if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
-                printf("%s: tokenizer model not supported!\n", __func__);
+                fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
                return false;
            }
        } else {
-            printf("%s: tokenizer model not found!\n", __func__);
+            fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
            return false;
        }

@@ -467,22 +467,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
        int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");

        if (tokens_keyidx == -1) {
-            printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
+            fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
            return false;
        }

        int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");

        if (merges_keyidx == -1) {
-            printf("%s: gpt2 tokenizer merges not found!\n", __func__);
+            fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
            return false;
        }

        hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
        hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);

-        printf("%s: gpt2 tokenizer vocab  = %zu\n", __func__, hparams.n_vocab);
-        printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
+        fprintf(stdout, "%s: gpt2 tokenizer vocab  = %zu\n", __func__, hparams.n_vocab);
+        fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);

        for (size_t i = 0; i < hparams.n_vocab; i++) {
            std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
@@ -523,12 +523,12 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) {   vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }

-        if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
-        if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
-        if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
-        if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
-        if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
-        if( vocab.linefeed_id    != -1 ) { printf("%s: LF token  = %d\n",      __func__, vocab.linefeed_id ); }
+        if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
+        if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
+        if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
+        if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
+        if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
+        if( vocab.linefeed_id    != -1 ) { fprintf(stdout, "%s: LF token  = %d\n",      __func__, vocab.linefeed_id ); }

    }

@@ -543,13 +543,13 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
    {
        const int n_tensors = gguf_get_n_tensors(ggufctx);

-        printf("%s: n_tensors: %d\n", __func__, n_tensors);
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);

        for (int i = 0; i < n_tensors; ++i) {
            const char * name   = gguf_get_tensor_name  (ggufctx, i);
            const size_t offset = gguf_get_tensor_offset(ggufctx, i);

-            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
        }
    }
    #endif
@@ -318,9 +318,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)

    struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
    if( cur == NULL ) {
-        printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
+        fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
    } else {
-//        printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
+//        fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
    }

    return cur;
@@ -346,21 +346,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
        return false;
    }

-    printf("%s: gguf version     = %d\n", __func__, gguf_get_version(ggufctx));
-    printf("%s: gguf alignment   = %zu\n", __func__, gguf_get_alignment(ggufctx));
-    printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
+    fprintf(stdout, "%s: gguf version     = %d\n", __func__, gguf_get_version(ggufctx));
+    fprintf(stdout, "%s: gguf alignment   = %zu\n", __func__, gguf_get_alignment(ggufctx));
+    fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));

    // print all kv
    #if 0
    {
        const int n_kv = gguf_get_n_kv(ggufctx);

-        printf("%s: n_kv: %d\n", __func__, n_kv);
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);

        for (int i = 0; i < n_kv; ++i) {
            const char * key = gguf_get_key(ggufctx, i);

-            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
        }
    }
    #endif
@@ -370,21 +370,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
        int keyidx;

        keyidx = gguf_find_key(ggufctx, "general.name");
-        if (keyidx != -1) { printf("%s: model name           = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model name           = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.description");
-        if (keyidx != -1) { printf("%s: model description    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model description    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.author");
-        if (keyidx != -1) { printf("%s: model author         = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model author         = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.license");
-        if (keyidx != -1) { printf("%s: model license        = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model license        = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.architecture");
-        if (keyidx != -1) { printf("%s: model architecture   = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model architecture   = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.file_type");
-        if (keyidx != -1) { printf("%s: model file type      = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model file type      = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
-        if (keyidx != -1) { printf("%s: model data layout    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model data layout    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
-        if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    }

    // check required metadata
@@ -395,11 +395,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
        keyidx = gguf_find_key(ggufctx, "general.architecture");
        if (keyidx != -1) {
            if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) {
-                printf("%s: model architecture not supported!\n", __func__);
+                fprintf(stdout, "%s: model architecture not supported!\n", __func__);
                return false;
            }
        } else {
-            printf("%s: gguf model architecture not found!\n", __func__);
+            fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
            return false;
        }

@@ -456,11 +456,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2

        if (keyidx != -1) {
            if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
-                printf("%s: tokenizer model not supported!\n", __func__);
+                fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
                return false;
            }
        } else {
-            printf("%s: tokenizer model not found!\n", __func__);
+            fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
            return false;
        }

@@ -468,22 +468,22 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
        int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");

        if (tokens_keyidx == -1) {
-            printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
+            fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
            return false;
        }

        int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");

        if (merges_keyidx == -1) {
-            printf("%s: gpt2 tokenizer merges not found!\n", __func__);
+            fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
            return false;
        }

        hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
        hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);

-        printf("%s: gpt2 tokenizer vocab  = %zu\n", __func__, hparams.n_vocab);
-        printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
+        fprintf(stdout, "%s: gpt2 tokenizer vocab  = %zu\n", __func__, hparams.n_vocab);
+        fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);

        for (size_t i = 0; i < hparams.n_vocab; i++) {
            std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
@@ -524,12 +524,12 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) {   vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }

-        if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
-        if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
-        if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
-        if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
-        if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
-        if( vocab.linefeed_id    != -1 ) { printf("%s: LF token  = %d\n",      __func__, vocab.linefeed_id ); }
+        if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
+        if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
+        if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
+        if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
+        if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
+        if( vocab.linefeed_id    != -1 ) { fprintf(stdout, "%s: LF token  = %d\n",      __func__, vocab.linefeed_id ); }
    }


@@ -543,13 +543,13 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
    {
        const int n_tensors = gguf_get_n_tensors(ggufctx);

-        printf("%s: n_tensors: %d\n", __func__, n_tensors);
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);

        for (int i = 0; i < n_tensors; ++i) {
            const char * name   = gguf_get_tensor_name  (ggufctx, i);
            const size_t offset = gguf_get_tensor_offset(ggufctx, i);

-            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
        }
    }
    #endif
@@ -165,26 +165,26 @@ static const cmd_params cmd_params_defaults = {
 };

 static void print_usage(int /* argc */, char ** argv) {
-    printf("usage: %s [options]\n", argv[0]);
-    printf("\n");
-    printf("options:\n");
-    printf("  -h, --help\n");
-    printf("  -m, --model <filename>            (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
-    printf("  -p, --n-prompt <n>                (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
-    printf("  -n, --n-gen <n>                   (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
-    printf("  -b, --batch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  --memory-f32 <0|1>                (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
-    printf("  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
-    printf("  -ngl N, --n-gpu-layers <n>        (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-    printf("  -mg i, --main-gpu <n>             (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    printf("  -lv, --low-vram <0|1>             (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
-    printf("  -mmq, --mul-mat-q <0|1>           (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
-    printf("  -ts, --tensor_split <ts0/ts1/..>               \n");
-    printf("  -r, --repetitions <n>             (default: %d)\n", cmd_params_defaults.reps);
-    printf("  -o, --output <csv|json|md|sql>    (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
-    printf("  -v, --verbose                     (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
-    printf("\n");
-    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
+    fprintf(stdout, "usage: %s [options]\n", argv[0]);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "options:\n");
+    fprintf(stdout, "  -h, --help\n");
+    fprintf(stdout, "  -m, --model <filename>            (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    fprintf(stdout, "  -p, --n-prompt <n>                (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+    fprintf(stdout, "  -n, --n-gen <n>                   (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    fprintf(stdout, "  -b, --batch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+    fprintf(stdout, "  --memory-f32 <0|1>                (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+    fprintf(stdout, "  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    fprintf(stdout, "  -ngl N, --n-gpu-layers <n>        (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    fprintf(stdout, "  -mg i, --main-gpu <n>             (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+    fprintf(stdout, "  -lv, --low-vram <0|1>             (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
+    fprintf(stdout, "  -mmq, --mul-mat-q <0|1>           (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
+    fprintf(stdout, "  -ts, --tensor_split <ts0/ts1/..>               \n");
+    fprintf(stdout, "  -r, --repetitions <n>             (default: %d)\n", cmd_params_defaults.reps);
+    fprintf(stdout, "  -o, --output <csv|json|md|sql>    (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
+    fprintf(stdout, "  -v, --verbose                     (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    fprintf(stdout, "\n");
+    fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");

 }

@@ -151,6 +151,14 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
    }

+    if (params.n_ctx > 2048) {
+        // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
+        LOG_TEE("%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
+    } else if (params.n_ctx < 8) {
+        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
+    }
+
    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

    if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -186,13 +194,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    if (params.n_ctx > llama_n_ctx(ctx)) {
-        LOG_TEE("%s: warning: base model only supports context sizes no greater than %d tokens (%d specified)\n", __func__, llama_n_ctx(ctx), params.n_ctx);
-    } else if (params.n_ctx < 8) {
-        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
-        params.n_ctx = 8;
-    }
-
    // print system information
    {
        LOG_TEE("\n");
@@ -368,7 +368,7 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens.  Then, we split the input up into context window size chunks to
        // process the entire prompt.
-        const int first = params.n_ctx/2;
+        const int first = std::min(512, params.n_ctx/2);
        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
        count += params.n_ctx - first - 1;
@@ -668,6 +668,11 @@ int main(int argc, char ** argv) {
        params.n_ctx += params.ppl_stride/2;
    }

+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+
    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

    if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -693,11 +698,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    if (params.n_ctx > llama_n_ctx(ctx)) {
-        fprintf(stderr, "%s: warning: model might not support context sizes greater than %d tokens (%d specified);"
-                "expect poor results\n", __func__, llama_n_ctx(ctx), params.n_ctx);
-    }
-
    // print system information
    {
        fprintf(stderr, "\n");
@@ -145,29 +145,7 @@
      color: #888;
    }

-
-    @keyframes loading-bg-wipe {
-      0% {
-        background-position: 0%;
-      }
-      100% {
-        background-position: 100%;
-      }
-    }
-
-    .loading {
-      --loading-color-1: #eeeeee00;
-      --loading-color-2: #eeeeeeff;
-      background-size: 50% 100%;
-      background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
-      animation: loading-bg-wipe 2s linear infinite;
-    }
-
    @media (prefers-color-scheme: dark) {
-      .loading {
-        --loading-color-1: #22222200;
-        --loading-color-2: #222222ff;
-      }
      .popover-content {
        background-color: black;
      }
@@ -343,10 +321,7 @@
    const llamaStats = signal(null)
    const controller = signal(null)

-    // currently generating a completion?
-    const generating = computed(() => controller.value != null)
-
-    // has the user started a chat?
+    const generating = computed(() => controller.value == null )
    const chatStarted = computed(() => session.value.transcript.length > 0)

    const transcriptUpdate = (transcript) => {
@@ -455,19 +430,11 @@
      return html`
        <form onsubmit=${submit}>
          <div>
-            <textarea
-               className=${generating.value ? "loading" : null}
-               oninput=${(e) => message.value = e.target.value}
-               onkeypress=${enterSubmits}
-               placeholder="Say something..."
-               rows=2
-               type="text"
-               value="${message}"
-            />
+            <textarea type="text" rows=2 onkeypress=${enterSubmits} value="${message}" oninput=${(e) => message.value = e.target.value} placeholder="Say something..."/>
          </div>
          <div class="right">
-            <button type="submit" disabled=${generating.value}>Send</button>
-            <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+            <button type="submit" disabled=${!generating.value} >Send</button>
+            <button onclick=${stop} disabled=${generating}>Stop</button>
            <button onclick=${reset}>Reset</button>
          </div>
        </form>
@@ -118,7 +118,7 @@ static void server_log(const char *level, const char *function, int line,
    }

    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
-    printf("%.*s\n", (int)str.size(), str.data());
+    fprintf(stdout, "%.*s\n", (int)str.size(), str.data());
    fflush(stdout);
 }

@@ -694,50 +694,50 @@ struct llama_server_context
 static void server_print_usage(const char *argv0, const gpt_params &params,
                               const server_params &sparams)
 {
-    printf("usage: %s [options]\n", argv0);
-    printf("\n");
-    printf("options:\n");
-    printf("  -h, --help            show this help message and exit\n");
-    printf("  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    printf("  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
-    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "usage: %s [options]\n", argv0);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "options:\n");
+    fprintf(stdout, "  -h, --help            show this help message and exit\n");
+    fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
+    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
    if (llama_mlock_supported())
    {
-        printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
    if (llama_mmap_supported())
    {
-        printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
-    printf("  --numa                attempt optimizations that help on some NUMA systems\n");
+    fprintf(stdout, "  --numa                attempt optimizations that help on some NUMA systems\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    printf("  -ngl N, --n-gpu-layers N\n");
-    printf("                        number of layers to store in VRAM\n");
-    printf("  -ts SPLIT --tensor-split SPLIT\n");
-    printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    printf("  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
-    printf("  -nommq, --no-mul-mat-q\n");
-    printf("                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
-    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
+    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stdout, "                        number of layers to store in VRAM\n");
+    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif
-    printf("  -m FNAME, --model FNAME\n");
-    printf("                        model path (default: %s)\n", params.model.c_str());
-    printf("  -a ALIAS, --alias ALIAS\n");
-    printf("                        set an alias for the model, will be added as `model` field in completion response\n");
-    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    printf("  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    printf("  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
-    printf("  --port PORT           port to listen (default  (default: %d)\n", sparams.port);
-    printf("  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
-    printf("  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
-    printf("  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
-    printf("\n");
+    fprintf(stdout, "  -m FNAME, --model FNAME\n");
+    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "  -a ALIAS, --alias ALIAS\n");
+    fprintf(stdout, "                        set an alias for the model, will be added as `model` field in completion response\n");
+    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stdout, "  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
+    fprintf(stdout, "  --port PORT           port to listen (default  (default: %d)\n", sparams.port);
+    fprintf(stdout, "  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
+    fprintf(stdout, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
+    fprintf(stdout, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
+    fprintf(stdout, "\n");
 }

 static void server_params_parse(int argc, char **argv, server_params &sparams,
@@ -1595,7 +1595,7 @@ int main(int argc, char **argv)
    svr.set_base_dir(sparams.public_path);

    // to make it ctrl+clickable:
-    printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
+    fprintf(stdout, "\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);

    LOG_INFO("HTTP server listening", {
                                          {"hostname", sparams.hostname},
@@ -6,7 +6,6 @@

 #include "common.h"
 #include "llama.h"
-#include "grammar-parser.h"

 #include <cmath>
 #include <cstdio>
@@ -110,35 +109,16 @@ int main(int argc, char ** argv) {
    // used to determine end of generation
    bool has_eos = false;

-    // grammar stuff
-    struct llama_grammar * grammar_dft = NULL;
-    struct llama_grammar * grammar_tgt = NULL;
-
-    grammar_parser::parse_state parsed_grammar;
-
-    // if requested - load the grammar, error checking is omitted for brevity
-    if (!params.grammar.empty()) {
-        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
-        // will be empty (default) if there are parse errors
-        if (parsed_grammar.rules.empty()) {
-            return 1;
-        }
-
-        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-        grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
-    }
-
    const auto t_dec_start = ggml_time_us();

    while (true) {
        LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));

+        // sample from the drafted tokens if any
        int i_dft = 0;
        while (true) {
-            // sample from the target model
-            const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
+            const llama_token id = llama_sample_token(ctx_tgt, NULL, NULL, params, last_tokens, candidates, i_dft);

-            // remember which tokens were sampled - used for repetition penalties during sampling
            last_tokens.erase(last_tokens.begin());
            last_tokens.push_back(id);

@@ -154,9 +134,8 @@ int main(int argc, char ** argv) {

            ++n_predict;

-            // check if the draft matches the target
            if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
-                LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
+                LOG("drafted token %d accepted\n", id);
                ++n_accept;
                ++n_past_tgt;
                ++n_past_dft;
@@ -166,14 +145,6 @@ int main(int argc, char ** argv) {
            }

            // the drafted token was rejected or we are out of drafted tokens
-
-            if (i_dft < (int) drafted.size()) {
-                LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n",
-                        i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str());
-            } else {
-                LOG("out of drafted tokens\n");
-            }
-
            llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
            ++n_past_dft;

@@ -187,16 +158,7 @@ int main(int argc, char ** argv) {
            break;
        }

-        if (grammar_tgt) {
-            if (grammar_dft) {
-                llama_grammar_free(grammar_dft);
-            }
-            grammar_dft = llama_grammar_copy(grammar_tgt);
-
-            LOG("copied target grammar to draft grammar\n");
-        }
-
-        // sample n_draft tokens from the draft model using greedy decoding
+        // sample n_draft tokens from the draft model picking the best token
        int n_past_cur = n_past_dft;
        for (int i = 0; i < n_draft; ++i) {
            float * logits = llama_get_logits(ctx_dft);
@@ -208,40 +170,25 @@ int main(int argc, char ** argv) {

            llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };

-            if (grammar_dft != NULL) {
-                llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
-            }
-
            // computes softmax and sorts the candidates
            llama_sample_softmax(ctx_dft, &cur_p);

            for (int i = 0; i < 3; ++i) {
-                LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str());
+                LOG(" - draft candidate %d: %d (%.3f)\n", i, cur_p.data[i].id, cur_p.data[i].p);
            }

-            // TODO: better logic?
+            // too low probability, stop drafting
            if (cur_p.data[0].p < 2*cur_p.data[1].p) {
-                LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
                break;
            }

-            // drafted token
-            const llama_token id = cur_p.data[0].id;
-
-            drafted.push_back(id);
+            drafted.push_back(cur_p.data[0].id);
            ++n_drafted;

-            // no need to evaluate the last drafted token, since we won't use the result
-            if (i == n_draft - 1) {
-                break;
-            }
-
-            // evaluate the drafted token on the draft model
-            llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
-            ++n_past_cur;
-
-            if (grammar_dft != NULL) {
-                llama_grammar_accept_token(ctx_dft, grammar_dft, id);
+            if (i < n_draft - 1) {
+                // evaluate the drafted token on the draft model
+                llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
+                ++n_past_cur;
            }
        }

@@ -249,7 +196,6 @@ int main(int argc, char ** argv) {
        llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads);
        ++n_past_tgt;

-        // the first token is always proposed by the traget model before the speculation loop
        drafted.erase(drafted.begin());
    }

@@ -280,10 +226,6 @@ int main(int argc, char ** argv) {
    llama_free(ctx_dft);
    llama_free_model(model_dft);

-    if (grammar_dft != NULL) {
-        llama_grammar_free(grammar_dft);
-        llama_grammar_free(grammar_tgt);
-    }
    llama_backend_free();

    fprintf(stderr, "\n\n");
@@ -1,8 +1,3 @@
-// defines MAP_ANONYMOUS
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
 #include "ggml-alloc.h"
 #include "ggml.h"
 #include <assert.h>
@@ -11,26 +6,6 @@
 #include <stdlib.h>
 #include <string.h>

-#ifdef __has_include
-    #if __has_include(<unistd.h>)
-        #include <unistd.h>
-        #if defined(_POSIX_MAPPED_FILES)
-            #include <sys/types.h>
-            #include <sys/mman.h>
-        #endif
-    #endif
-#endif
-
-#if defined(_WIN32)
-    #define WIN32_LEAN_AND_MEAN
-    #ifndef NOMINMAX
-        #define NOMINMAX
-    #endif
-    #include <windows.h>
-    #include <memoryapi.h>
-#endif
-
-
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@@ -124,24 +99,19 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif

-static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+
+static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    return ggml_nbytes(tensor);

    UNUSED(alloc);
 }

-// check if a tensor is allocated by this buffer
-static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
-    void * ptr = tensor->data;
-    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
-}
-
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
    GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
 #endif
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
    size = aligned_offset(NULL, size, alloc->alignment);

    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -207,17 +177,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 }

 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    void * ptr = tensor->data;

-    if (ggml_allocr_is_own(alloc, tensor) == false) {
+    if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
        // the tensor was not allocated in this buffer
        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
        // the easiest way to deal with this is just to ignore it
        return;
    }

-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
    size = aligned_offset(NULL, size, alloc->alignment);
    AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);

@@ -311,64 +281,24 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
    return alloc;
 }

-// OS specific functions to allocate and free uncommitted virtual memory
-static void * alloc_vmem(size_t size) {
-#if defined(_WIN32)
-    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
-#elif defined(_POSIX_MAPPED_FILES)
-    return mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+// address and size of the buffer when measuring
+// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
+static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
+#if defined(__ARM_NEON) && !defined(__aarch64__)
+// 32-bit
+// TODO: Use for 32-bit x86 as well
+static const size_t MEASURE_MAX_SIZE  = (1ULL<<32) - 1; // 4 GB
 #else
-    // use a fixed address for other platforms
-    uintptr_t base_addr = (uintptr_t)-size - 0x100;
-    return (void *)base_addr;
+// 64-bit
+static const size_t MEASURE_MAX_SIZE  = 1ULL<<40; // 1 TB
 #endif
-}
-
-static void free_vmem(void * base_addr, size_t size) {
-#if defined(_WIN32)
-    VirtualFree(base_addr, 0, MEM_RELEASE);
-    UNUSED(size);
-#elif defined(_POSIX_MAPPED_FILES)
-    munmap(base_addr, size);
-#else
-    // nothing to do
-    UNUSED(base_addr);
-    UNUSED(size);
-#endif
-}
-
-// allocate uncommitted virtual memory to measure the size of the graph
-static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-    // 1TB for 64-bit, 1GB for 32-bit
-    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
-    do {
-        *base_addr = alloc_vmem(*size);
-        if (*base_addr != NULL) {
-            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
-            return;
-        }
-        // try again with half the size
-        *size /= 2;
-    } while (*size > 0);
-
-    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
-}
-
-static void free_measure_vmem(void * base_addr, size_t size) {
-    free_vmem(base_addr, size);
-}

 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);

-    void * base_addr;
-    size_t size;
-
-    alloc_measure_vmem(&base_addr, &size);
-
    *alloc = (struct ggml_allocr){
-        /*.data          = */ base_addr,
-        /*.size          = */ size,
+        /*.data          = */ MEASURE_BASE_ADDR,
+        /*.size          = */ MEASURE_MAX_SIZE,
        /*.alignment     = */ alignment,
        /*.n_free_blocks = */ 0,
        /*.free_blocks   = */ {{0}},
@@ -388,9 +318,6 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }

 void ggml_allocr_free(struct ggml_allocr * alloc) {
-    if (alloc->measure) {
-        free_measure_vmem(alloc->data, alloc->size);
-    }
    free(alloc);
 }

@@ -460,7 +387,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                    }

                    // if the node's data is external, then we cannot re-use it
-                    if (ggml_allocr_is_own(alloc, parent) == false) {
+                    if ((char *) parent->data < (char *) alloc->data ||
+                        (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
                        AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                        continue;
                    }
@@ -494,7 +422,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
    }
 }

-static size_t ggml_allocr_alloc_graph_tensors_n(
+static size_t ggml_allocator_alloc_graph_tensors_n(
    struct ggml_allocr * alloc,
    struct ggml_cgraph ** graphs, int n_graphs,
    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -572,10 +500,11 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
                AT_PRINTF("\n");
            }

+
            // update parents
            // update immediately if there is no parse_seq
            // update only at barriers if there is parse_seq
-            if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
                int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
                int update_end   = alloc->parse_seq_len ? ind              : ind + 1;
                for (int i = update_start; i < update_end; i++) {
@@ -599,12 +528,12 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
                                view_src_hn->n_views -= 1;
                                AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                                if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                                    ggml_allocr_free_tensor(alloc, view_src);
+                                    ggml_allocator_free_tensor(alloc, view_src);
                                }
                            }
                            else {
                                if (parent->data != node->data) {
-                                    ggml_allocr_free_tensor(alloc, parent);
+                                    ggml_allocator_free_tensor(alloc, parent);
                                }
                            }
                        }
@@ -621,7 +550,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
            for (int i = 0; outputs[g][i] != NULL; i++) {
                struct ggml_tensor * output = outputs[g][i];
                AT_PRINTF("output: %s\n", output->name);
-                ggml_allocr_free_tensor(alloc, output);
+                ggml_allocator_free_tensor(alloc, output);
            }
        }
    }
@@ -630,5 +559,5 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 }

 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+    return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }
@@ -464,91 +464,58 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
    dst[i] = x[i] / (1.0f + expf(-x[i]));
 }

-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
-template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    const int tid = threadIdx.x;

    const float eps = 1e-5f;

-    float2 mean_var = make_float2(0.f, 0.f);
+    float mean = 0.0f;
+    float var = 0.0f;

-    for (int col = tid; col < ncols; col += block_size) {
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
        const float xi = x[row*ncols + col];
-        mean_var.x += xi;
-        mean_var.y += xi * xi;
+        mean += xi;
+        var += xi * xi;
    }

    // sum up partial sums
-    mean_var = warp_reduce_sum(mean_var);
-    if (block_size > WARP_SIZE) {
-        __shared__ float2 s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = mean_var;
-        }
-        __syncthreads();
-        mean_var = s_sum[lane_id];
-        mean_var = warp_reduce_sum(mean_var);
-    }
-
-    const float mean = mean_var.x / ncols;
-    const float var = mean_var.y / ncols - mean * mean;
-    const float inv_std = rsqrtf(var + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-    }
-}
-
-static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
+        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    }
+
+    mean /= ncols;
+    var = var / ncols - mean * mean;
+    const float inv_var = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
    }
-    return x;
 }

-template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    const int tid = threadIdx.x;

    float tmp = 0.0f; // partial sum for thread in warp

-    for (int col = tid; col < ncols; col += block_size) {
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
        const float xi = x[row*ncols + col];
        tmp += xi * xi;
    }

    // sum up partial sums
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
    }

    const float mean = tmp / ncols;
    const float scale = rsqrtf(mean + eps);

-    for (int col = tid; col < ncols; col += block_size) {
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
        dst[row*ncols + col] = scale * x[row*ncols + col];
    }
 }
@@ -4236,24 +4203,14 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_

 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
-    }
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }

 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    }
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 }

 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
        return;
    }

-    cl_mem mem = (cl_mem)tensor->extra;
+    cl_mem mem = (cl_mem)tensor->data;
    clReleaseMemObject(mem);
 }

@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
    size_t d_size;

    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
+    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst


@@ -1491,7 +1491,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
    size_t d_size;
    cl_mem d_X;
    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->extra;
+        d_X = (cl_mem) src0->data;
    } else {
        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
    }
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
    size_t d_size;
    cl_mem d_X;
    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->extra;
+        d_X = (cl_mem) src0->data;
    } else {
        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
    }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                events.emplace_back();
                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
            } else if (src0->backend == GGML_BACKEND_GPU) {
-                d_Q = (cl_mem) src0->extra;
+                d_Q = (cl_mem) src0->data;
            } else {
                GGML_ASSERT(false);
            }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {

    CL_CHECK(clFinish(queue));

-    tensor->extra = dst;
+    tensor->data = dst;
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
@@ -1,34 +0,0 @@
-# This is the same as json.gbnf but we restrict whitespaces at the end of the root array
-# Useful for generating JSON arrays
-
-root   ::= arr
-value  ::= object | array | string | number | ("true" | "false" | "null") ws
-
-arr  ::=
-  "[\n" ws (
-            value
-    (",\n" ws value)*
-  )? "]"
-
-object ::=
-  "{" ws (
-            string ":" ws value
-    ("," ws string ":" ws value)*
-  )? "}" ws
-
-array  ::=
-  "[" ws (
-            value
-    ("," ws value)*
-  )? "]" ws
-
-string ::=
-  "\"" (
-    [^"\\] |
-    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
-  )* "\"" ws
-
-number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
-
-# Optional space: by convention, applied in this grammar after literal chars when allowed
-ws ::= ([ \t\n] ws)?
@@ -83,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
        float ax = fabsf(x[i]);
        if (ax > amax) { amax = ax; max = x[i]; }
    }
-    if (amax < 1e-30f) { // all zero
+    if (!amax) { // all zero
        for (int i = 0; i < n; ++i) {
            L[i] = 0;
        }
@@ -1086,12 +1086,6 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict

        }

-        if (!max_abs_scale) {
-            memset(&y[i], 0, sizeof(block_q6_K));
-            y[i].d = ggml_fp32_to_fp16(0.f);
-            continue;
-        }
-
        float iscale = -128.f/max_scale;
        y[i].d = ggml_fp32_to_fp16(1/iscale);
        for (int ib = 0; ib < QK_K/16; ++ib) {
@@ -2942,12 +2942,7 @@ static bool llama_eval_internal(

    // for big prompts, if BLAS is enabled, it is better to use only one thread
    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-    //       with the BLAS calls. need a better solution
-    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-        n_threads = std::min(4, n_threads);
-    }
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@@ -3855,25 +3850,6 @@ void llama_grammar_free(struct llama_grammar * grammar) {
    delete grammar;
 }

-struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
-    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
-
-    // redirect elements in stacks to point to new rules
-    for (size_t is = 0; is < result->stacks.size(); is++) {
-        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
-            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
-                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
-                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
-                         result->stacks[is][ie]  =  &result->rules[ir0][ir1];
-                    }
-                }
-            }
-        }
-    }
-
-    return result;
-}
-
 //
 // sampling
 //
@@ -5364,7 +5340,7 @@ struct llama_context_params llama_context_default_params() {
        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
        /*.n_ctx                       =*/ 512,
        /*.n_batch                     =*/ 512,
-        /*.n_gpu_layers                =*/ 0,
+        /*.gpu_layers                  =*/ 0,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
        /*.rope_freq_base              =*/ 10000.0f,
@@ -5381,10 +5357,6 @@ struct llama_context_params llama_context_default_params() {
        /*.embedding                   =*/ false,
    };

-#ifdef GGML_USE_METAL
-    result.n_gpu_layers = 1;
-#endif
-
    return result;
 }

@@ -5577,44 +5549,44 @@ struct llama_context * llama_new_context_with_model(
            }
 #endif
        }
+    }

 #ifdef GGML_USE_METAL
-        if (params.n_gpu_layers > 0) {
-            // this allocates all Metal resources and memory buffers
+    if (params.n_gpu_layers > 0) {
+        // this allocates all Metal resources and memory buffers

-            void * data_ptr  = NULL;
-            size_t data_size = 0;
+        void * data_ptr  = NULL;
+        size_t data_size = 0;

-            if (params.use_mmap) {
-                data_ptr  = ctx->model.mapping->addr;
-                data_size = ctx->model.mapping->size;
-            } else {
-                data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
-                data_size = ggml_get_mem_size  (ctx->model.ctx);
-            }
+        if (params.use_mmap) {
+            data_ptr  = ctx->model.mapping->addr;
+            data_size = ctx->model.mapping->size;
+        } else {
+            data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size = ggml_get_mem_size  (ctx->model.ctx);
+        }

-            const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

-            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);

 #define LLAMA_METAL_CHECK_BUF(result)                            \
-            if (!(result)) {                                             \
-                LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
-                llama_free(ctx);                                         \
-                return NULL;                                             \
-            }
-
-            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
-
-            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
-            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
-
-            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
-#undef LLAMA_METAL_CHECK_BUF
-        }
-#endif
+    if (!(result)) {                                             \
+        LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+        llama_free(ctx);                                         \
+        return NULL;                                             \
    }

+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
+#undef LLAMA_METAL_CHECK_BUF
+    }
+#endif
+
 #ifdef GGML_USE_MPI
    ctx->ctx_mpi = ggml_mpi_init();

@@ -410,8 +410,6 @@ extern "C" {

    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);

-    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
-
    //
    // Sampling functions
    //
Author	SHA1	Message	Date
Georgi Gerganov	847896aba7	speculative : add --draft CLI arg	2023-09-03 13:51:07 +03:00
Georgi Gerganov	a15ca746c7	speculative : print encoding speed	2023-09-03 13:45:13 +03:00
Georgi Gerganov	c82c808da0	speculative : initial example	2023-09-03 13:44:31 +03:00