swift : fix build

ggml-ci
batched.swift : fix build
2026-06-30 17:47:40 +02:00 · 2024-02-23 19:02:09 +02:00 · 2024-02-23 16:15:37 +02:00 · 2024-02-23 14:25:05 +02:00 · 2024-02-23 12:34:16 +02:00 · 2024-02-22 17:05:23 -05:00
22 changed files with 243 additions and 150 deletions
@@ -218,6 +218,8 @@ class Model:
            return BertModel
        if model_architecture == "NomicBertModel":
            return NomicBertModel
+        if model_architecture == "GemmaForCausalLM":
+            return GemmaModel
        return Model

    def _is_model_safetensors(self) -> bool:
@@ -277,6 +279,8 @@ class Model:
            return gguf.MODEL_ARCH.BERT
        if arch == "NomicBertModel":
            return gguf.MODEL_ARCH.NOMIC_BERT
+        if arch == "GemmaForCausalLM":
+            return gguf.MODEL_ARCH.GEMMA

        raise NotImplementedError(f'Architecture "{arch}" not supported!')

@@ -618,11 +622,6 @@ class MPTModel(Model):

            self.gguf_writer.add_tensor(new_name, data)

-            # note: MPT output is tied to (same as) wte in original model;
-            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
-            if new_name == "token_embd.weight":
-                self.gguf_writer.add_tensor("output.weight", data)
-

 class OrionModel(Model):
    def set_vocab(self):
@@ -1786,6 +1785,62 @@ class NomicBertModel(BertModel):
            yield name, data


+class GemmaModel(Model):
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_key_length(hparams["head_dim"])
+        self.gguf_writer.add_value_length(hparams["head_dim"])
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        for name, data_torch in self.get_tensors():
+            # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
+            if name.endswith("norm.weight"):
+                data_torch = data_torch + 1
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+
 ###### CONVERSION LOGIC ######


@@ -1015,9 +1015,9 @@ static struct ggml_tensor * forward_lora(
    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, N);
    {
-        int * data = (int *) KQ_pos->data;
+        float * data = (float *) KQ_pos->data;
        for (int i = 0; i < N; ++i) {
            data[i] = n_past + i;
        }
@@ -79,7 +79,7 @@ batch.n_tokens = Int32(tokens.count)

 for (i, token) in tokens.enumerated() {
    batch.token[i] = token
-    batch.pos[i] = Int32(i)
+    batch.pos[i] = llama_pos(i)
    batch.n_seq_id[i] = 1
    // batch.seq_id[i][0] = 0
    // TODO: is this the proper way to do this?
@@ -98,7 +98,7 @@ if llama_decode(context, batch) != 0 {
 }

 for i in 1 ..< n_parallel {
-    llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
+    llama_kv_cache_seq_cp(context, 0, Int32(i), 0, llama_pos(batch.n_tokens))
 }

 if n_parallel > 1 {
@@ -125,8 +125,8 @@ while n_cur <= n_len {
            continue
        }

-        var n_vocab = llama_n_vocab(model)
-        var logits = llama_get_logits_ith(context, i_batch[i])
+        let n_vocab = llama_n_vocab(model)
+        let logits = llama_get_logits_ith(context, i_batch[i])

        var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))

@@ -173,7 +173,7 @@ while n_cur <= n_len {

        // push this new token for next evaluation
        batch.token[Int(batch.n_tokens)] = new_token_id
-        batch.pos[Int(batch.n_tokens)] = n_cur
+        batch.pos[Int(batch.n_tokens)] = llama_pos(n_cur)
        batch.n_seq_id[Int(batch.n_tokens)] = 1
        if let seq_id = batch.seq_id[Int(batch.n_tokens)] {
            seq_id[0] = Int32(i)
@@ -554,7 +554,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    };

    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, N);
    ggml_set_input(KQ_pos);

    // rope has so much parameters that we make a custom function for it
@@ -743,7 +743,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(

        // set KQ_pos
        {
-            int * data = (int *) KQ_pos->data;
+            float * data = (float *) KQ_pos->data;
            for (int i = 0; i < N; ++i) {
                data[i] = n_past + i;
            }
@@ -129,7 +129,7 @@ actor LlamaContext {

        for i1 in 0..<tokens_list.count {
            let i = Int(i1)
-            llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
+            llama_batch_add(&batch, tokens_list[i], llama_pos(i), [0], false)
        }
        batch.logits[Int(batch.n_tokens) - 1] = 1 // true

@@ -183,7 +183,7 @@ actor LlamaContext {
        // tokens_list.append(new_token_id)

        llama_batch_clear(&batch)
-        llama_batch_add(&batch, new_token_id, n_cur, [0], true)
+        llama_batch_add(&batch, new_token_id, llama_pos(n_cur), [0], true)

        n_decode += 1
        n_cur    += 1
@@ -210,7 +210,7 @@ actor LlamaContext {
            let n_tokens = pp

            for i in 0..<n_tokens {
-                llama_batch_add(&batch, 0, Int32(i), [0], false)
+                llama_batch_add(&batch, 0, llama_pos(i), [0], false)
            }
            batch.logits[Int(batch.n_tokens) - 1] = 1 // true

@@ -234,7 +234,7 @@ actor LlamaContext {
                llama_batch_clear(&batch)

                for j in 0..<pl {
-                    llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
+                    llama_batch_add(&batch, 0, llama_pos(i), [Int32(j)], true)
                }

                if llama_decode(context, batch) != 0 {
@@ -338,7 +338,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, (float) *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
@@ -1281,7 +1281,7 @@ struct llama_server_context
                }

                const int n_embd = llama_n_embd(model);
-                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
+                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, (float) slot.n_past, 1, 0, };
                if (llama_decode(ctx, batch_img))
                {
                    LOG_TEE("%s : failed to eval image\n", __func__);
@@ -291,7 +291,7 @@ static struct ggml_tensor * llama_build_train_graphs(
    };

    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, N);
    ggml_set_input(KQ_pos);

    // rope has so much parameters that we make a custom function for it
@@ -419,7 +419,7 @@ static struct ggml_tensor * llama_build_train_graphs(
            ggml_gallocr_alloc_graph(alloc, gb);

            if (!measure_only) {
-                int * data = (int *) KQ_pos->data;
+                float * data = (float *) KQ_pos->data;
                for (int i = 0; i < N; ++i) {
                    data[i] = n_past + i;
                }
@@ -1,3 +1,7 @@
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
 #include <algorithm>
 #include <assert.h>
 #include <atomic>
@@ -121,11 +125,6 @@

 #endif // defined(GGML_USE_HIPBLAS)

-// ggml-cuda need half type so keep ggml headers include at last
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-
 #define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)

 #define CC_PASCAL     600
@@ -6041,7 +6040,7 @@ static __device__ void rope_yarn(
 // rope == RoPE == rotary positional embedding
 template<typename T, bool has_pos>
 static __global__ void rope(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    const T * x, T * dst, int ncols, const float * pos, float freq_scale, int p_delta_rows, float freq_base,
    float ext_factor, float attn_factor, rope_corr_dims corr_dims
 ) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
@@ -6054,7 +6053,7 @@ static __global__ void rope(
    const int i = row*ncols + col;
    const int i2 = row/p_delta_rows;

-    const int p = has_pos ? pos[i2] : 0;
+    const float p = has_pos ? pos[i2] : 0.0f;
    const float theta_base = p*powf(freq_base, -float(col)/ncols);

    float cos_theta, sin_theta;
@@ -6069,7 +6068,7 @@ static __global__ void rope(

 template<typename T, bool has_pos>
 static __global__ void rope_neox(
-    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const T * x, T * dst, int ncols, int n_dims, const float * pos, float freq_scale, int p_delta_rows,
    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
 ) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
@@ -6096,7 +6095,7 @@ static __global__ void rope_neox(

    float cur_rot = inv_ndims * ic - ib;

-    const int p = has_pos ? pos[i2] : 0;
+    const float p = has_pos ? pos[i2] : 0.0f;
    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);

    float cos_theta, sin_theta;
@@ -6110,7 +6109,7 @@ static __global__ void rope_neox(
 }

 static __global__ void rope_glm_f32(
-    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    const float * x, float * dst, int ncols, const float * pos, float freq_scale, int p_delta_rows, float freq_base,
    int n_ctx
 ) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
@@ -6125,10 +6124,10 @@ static __global__ void rope_glm_f32(
    const int i2 = row/p_delta_rows;

    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
-     // FIXME: this is likely wrong
-    const int p = pos != nullptr ? pos[i2] : 0;

-    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
+    const float p = pos != nullptr ? pos[i2] : 0.0f;
+
+    const float theta = min(p, (float) n_ctx - 2)*freq_scale*col_theta_scale;
    const float sin_theta = sinf(theta);
    const float cos_theta = cosf(theta);

@@ -6138,7 +6137,7 @@ static __global__ void rope_glm_f32(
    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

-    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
+    const float block_theta = max(p - n_ctx - 2, 0.0f)*col_theta_scale;
    const float sin_block_theta = sinf(block_theta);
    const float cos_block_theta = cosf(block_theta);

@@ -7689,7 +7688,7 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const

 template<typename T>
 static void rope_cuda(
-    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const T * x, T * dst, int ncols, int nrows, const float * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 ) {
    GGML_ASSERT(ncols % 2 == 0);
@@ -7709,7 +7708,7 @@ static void rope_cuda(

 template<typename T>
 static void rope_neox_cuda(
-    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const T * x, T * dst, int ncols, int n_dims, int nrows, const float * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 ) {
    GGML_ASSERT(ncols % 2 == 0);
@@ -7734,7 +7733,7 @@ static void rope_neox_cuda(
 }

 static void rope_glm_f32_cuda(
-    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const float * x, float * dst, int ncols, int nrows, const float * pos, float freq_scale, int p_delta_rows,
    float freq_base, int n_ctx, cudaStream_t stream
 ) {
    GGML_ASSERT(ncols % 4 == 0);
@@ -9036,11 +9035,11 @@ static void ggml_cuda_op_rope(
    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));

-    const int32_t * pos = nullptr;
+    const float * pos = nullptr;
    if ((mode & 1) == 0) {
-        GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        GGML_ASSERT(src1->type == GGML_TYPE_F32);
        GGML_ASSERT(src1->ne[0] == ne2);
-        pos = (const int32_t *) src1_dd;
+        pos = (const float *) src1_dd;
    }

    const bool is_neox = mode & 2;
@@ -53,11 +53,23 @@ extern "C" {
 //
 #include <arm_neon.h>

-#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

-#define GGML_FP16_TO_FP32(x) ((float) (x))
-#define GGML_FP32_TO_FP16(x) (x)
+#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    __fp16 tmp;
+    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+    return (float)tmp;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    ggml_fp16_t res;
+    __fp16 tmp = f;
+    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+    return res;
+}

 #else

@@ -214,8 +226,7 @@ extern float ggml_table_f32_f16[1 << 16];
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
 // This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
-
+#if !defined(GGML_FP16_TO_FP32)
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
@@ -223,8 +234,10 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 }

 #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+#endif

+#if !defined(GGML_FP32_TO_FP16)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif

 #define GGML_HASHTABLE_FULL ((size_t)-1)
@@ -2057,7 +2057,13 @@ static bool ggml_metal_graph_compute(
                        // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
                        const int n_orig_ctx = ((int32_t *) dst->op_params)[4];

-                        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+                        float freq_base;
+                        float freq_scale;
+                        float ext_factor;
+                        float attn_factor;
+                        float beta_fast;
+                        float beta_slow;
+
                        memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
                        memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
                        memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
@@ -1674,7 +1674,7 @@ static void rope_yarn_corr_dims(

 typedef void (rope_t)(
        device const    void * src0,
-        device const int32_t * src1,
+        device const   float * src1,
        device         float * dst,
        constant     int64_t & ne00,
        constant     int64_t & ne01,
@@ -1709,7 +1709,7 @@ typedef void (rope_t)(
 template<typename T>
 kernel void kernel_rope(
        device const    void * src0,
-        device const int32_t * src1,
+        device const   float * src1,
        device         float * dst,
        constant     int64_t & ne00,
        constant     int64_t & ne01,
@@ -1749,11 +1749,11 @@ kernel void kernel_rope(
    float corr_dims[2];
    rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);

-    device const int32_t * pos = src1;
+    device const float * pos = src1;

-    const int64_t p = pos[i2];
+    const float p = pos[i2];

-    const float theta_0 = (float)p;
+    const float theta_0 = p;
    const float inv_ndims = -1.f/n_dims;

    if (!is_neox) {
@@ -5654,8 +5654,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

    for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d    =  y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        const uint8_t * restrict q2 = x[i].qs;
        const int8_t  * restrict q8 = y[i].qs;
@@ -5804,8 +5804,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

    for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d    =  y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        const uint8_t * restrict q2 = x[i].qs;
        const int8_t  * restrict q8 = y[i].qs;
@@ -6458,7 +6458,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

        int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);

        const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
        q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
@@ -6660,7 +6660,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

        int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);

        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);

@@ -7163,9 +7163,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
        aux16[1] = (a[0] >> 4) & 0x0f0f;

        const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
-        sum_mins += y[i].d * (float)x[i].d[1] * summi;
+        sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi;

-        const float d = y[i].d * (float)x[i].d[0];
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);

        const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);

@@ -7823,7 +7823,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r

    for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        const int8_t * sc = x[i].scales;

        const uint8_t * restrict q5 = x[i].qs;
@@ -7965,7 +7965,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r

    for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        const int8_t * sc = x[i].scales;

        const uint8_t * restrict q5 = x[i].qs;
@@ -8533,7 +8533,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r

    for (int i = 0; i < nb; ++i) {

-        const float d_all = (float)x[i].d;
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);

        const uint8_t * restrict q6 = x[i].ql;
        const uint8_t * restrict qh = x[i].qh;
@@ -8704,7 +8704,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r

    for (int i = 0; i < nb; ++i) {

-        const float d_all = (float)x[i].d;
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);

        const uint8_t * restrict q6 = x[i].ql;
        const uint8_t * restrict qh = x[i].qh;
@@ -9523,7 +9523,6 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
    float sumf = 0;

    for (int ib = 0; ib < nb; ib += 2) {
-
        q4bits.val[0] = vld1q_u8(x[ib+0].qs);
        q4bits.val[1] = vld1q_u8(x[ib+1].qs);
        q8b.val[0]    = vld1q_s8(y[ib+0].qs);
@@ -9539,8 +9538,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

-        sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2);
-
+        sumf +=
+            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
+            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
    }

    *s = sumf;
@@ -323,7 +323,7 @@ float ggml_table_f32_f16[1 << 16];
 // note: do not use these inside ggml.c
 // these are meant to be used via the ggml.h API
 float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return (float) GGML_FP16_TO_FP32(x);
+    return GGML_FP16_TO_FP32(x);
 }

 ggml_fp16_t ggml_fp32_to_fp16(float x) {
@@ -355,6 +355,18 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
    }
 }

+static void ggml_i32_to_f32_row(const int32_t * x, float * y, int n) {
+    for (int i = 0; i < n; i++) {
+        y[i] = (float) x[i];
+    }
+}
+
+static void ggml_f32_to_i32_row(const float * x, int32_t * y, int n) {
+    for (int i = 0; i < n; i++) {
+        y[i] = (int32_t) x[i];
+    }
+}
+
 //
 // timing
 //
@@ -454,6 +466,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = 1,
        .type_size                = sizeof(int32_t),
        .is_quantized             = false,
+        .to_float                 = (ggml_to_float_t)   ggml_i32_to_f32_row,
+        .from_float               = (ggml_from_float_t) ggml_f32_to_i32_row,
+        .from_float_reference     = (ggml_from_float_t) ggml_f32_to_i32_row,
    },
    [GGML_TYPE_F32] = {
        .type_name                = "f32",
@@ -469,10 +484,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = 1,
        .type_size                = sizeof(ggml_fp16_t),
        .is_quantized             = false,
-        .to_float                 = (ggml_to_float_t) ggml_fp16_to_fp32_row,
+        .to_float                 = (ggml_to_float_t)   ggml_fp16_to_fp32_row,
        .from_float               = (ggml_from_float_t) ggml_fp32_to_fp16_row,
        .from_float_reference     = (ggml_from_float_t) ggml_fp32_to_fp16_row,
-        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f16,
+        .vec_dot                  = (ggml_vec_dot_t)    ggml_vec_dot_f16,
        .vec_dot_type             = GGML_TYPE_F16,
        .nrows                    = 1,
    },
@@ -481,8 +496,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = QK4_0,
        .type_size                = sizeof(block_q4_0),
        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q4_0,
-        .from_float               = quantize_row_q4_0,
+        .to_float                 = (ggml_to_float_t)   dequantize_row_q4_0,
+        .from_float               = (ggml_from_float_t) quantize_row_q4_0,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_0_reference,
        .vec_dot                  = ggml_vec_dot_q4_0_q8_0,
        .vec_dot_type             = GGML_TYPE_Q8_0,
@@ -497,8 +512,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = QK4_1,
        .type_size                = sizeof(block_q4_1),
        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
-        .from_float               = quantize_row_q4_1,
+        .to_float                 = (ggml_to_float_t)   dequantize_row_q4_1,
+        .from_float               = (ggml_from_float_t) quantize_row_q4_1,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_1_reference,
        .vec_dot                  = ggml_vec_dot_q4_1_q8_1,
        .vec_dot_type             = GGML_TYPE_Q8_1,
@@ -537,8 +552,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = QK5_0,
        .type_size                = sizeof(block_q5_0),
        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q5_0,
-        .from_float               = quantize_row_q5_0,
+        .to_float                 = (ggml_to_float_t)   dequantize_row_q5_0,
+        .from_float               = (ggml_from_float_t) quantize_row_q5_0,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_0_reference,
        .vec_dot                  = ggml_vec_dot_q5_0_q8_0,
        .vec_dot_type             = GGML_TYPE_Q8_0,
@@ -549,8 +564,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = QK5_1,
        .type_size                = sizeof(block_q5_1),
        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q5_1,
-        .from_float               = quantize_row_q5_1,
+        .to_float                 = (ggml_to_float_t)   dequantize_row_q5_1,
+        .from_float               = (ggml_from_float_t) quantize_row_q5_1,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_1_reference,
        .vec_dot                  = ggml_vec_dot_q5_1_q8_1,
        .vec_dot_type             = GGML_TYPE_Q8_1,
@@ -561,8 +576,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = QK8_0,
        .type_size                = sizeof(block_q8_0),
        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q8_0,
-        .from_float               = quantize_row_q8_0,
+        .to_float                 = (ggml_to_float_t)   dequantize_row_q8_0,
+        .from_float               = (ggml_from_float_t) quantize_row_q8_0,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q8_0_reference,
        .vec_dot                  = ggml_vec_dot_q8_0_q8_0,
        .vec_dot_type             = GGML_TYPE_Q8_0,
@@ -577,7 +592,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = QK8_1,
        .type_size                = sizeof(block_q8_1),
        .is_quantized             = true,
-        .from_float               = quantize_row_q8_1,
+        .from_float               = (ggml_from_float_t) quantize_row_q8_1,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q8_1_reference,
        .vec_dot_type             = GGML_TYPE_Q8_1,
        .nrows                    = 1,
@@ -587,8 +602,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = QK_K,
        .type_size                = sizeof(block_q2_K),
        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q2_K,
-        .from_float               = quantize_row_q2_K,
+        .to_float                 = (ggml_to_float_t)   dequantize_row_q2_K,
+        .from_float               = (ggml_from_float_t) quantize_row_q2_K,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q2_K_reference,
        .vec_dot                  = ggml_vec_dot_q2_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
@@ -599,8 +614,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = QK_K,
        .type_size                = sizeof(block_q3_K),
        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q3_K,
-        .from_float               = quantize_row_q3_K,
+        .to_float                 = (ggml_to_float_t)   dequantize_row_q3_K,
+        .from_float               = (ggml_from_float_t) quantize_row_q3_K,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q3_K_reference,
        .vec_dot                  = ggml_vec_dot_q3_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
@@ -611,8 +626,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = QK_K,
        .type_size                = sizeof(block_q4_K),
        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q4_K,
-        .from_float               = quantize_row_q4_K,
+        .to_float                 = (ggml_to_float_t)   dequantize_row_q4_K,
+        .from_float               = (ggml_from_float_t) quantize_row_q4_K,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_K_reference,
        .vec_dot                  = ggml_vec_dot_q4_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
@@ -623,8 +638,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = QK_K,
        .type_size                = sizeof(block_q5_K),
        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q5_K,
-        .from_float               = quantize_row_q5_K,
+        .to_float                 = (ggml_to_float_t)   dequantize_row_q5_K,
+        .from_float               = (ggml_from_float_t) quantize_row_q5_K,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_K_reference,
        .vec_dot                  = ggml_vec_dot_q5_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
@@ -635,8 +650,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = QK_K,
        .type_size                = sizeof(block_q6_K),
        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q6_K,
-        .from_float               = quantize_row_q6_K,
+        .to_float                 = (ggml_to_float_t)   dequantize_row_q6_K,
+        .from_float               = (ggml_from_float_t) quantize_row_q6_K,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q6_K_reference,
        .vec_dot                  = ggml_vec_dot_q6_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
@@ -671,9 +686,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = QK_K,
        .type_size                = sizeof(block_iq3_xxs),
        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq3_xxs,
-        .from_float               = quantize_row_iq3_xxs,
-        .from_float_reference     = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
+        .to_float                 = (ggml_to_float_t)   dequantize_row_iq3_xxs,
+        .from_float               = (ggml_from_float_t) quantize_row_iq3_xxs,
+        .from_float_reference     = (ggml_from_float_t) quantize_row_iq3_xxs_reference,
        .vec_dot                  = ggml_vec_dot_iq3_xxs_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
        .nrows                    = 1,
@@ -695,9 +710,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .blck_size                = QK4_NL,
        .type_size                = sizeof(block_iq4_nl),
        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq4_nl,
-        .from_float               = quantize_row_iq4_nl,
-        .from_float_reference     = (ggml_from_float_t)quantize_row_iq4_nl_reference,
+        .to_float                 = (ggml_to_float_t)   dequantize_row_iq4_nl,
+        .from_float               = (ggml_from_float_t) quantize_row_iq4_nl,
+        .from_float_reference     = (ggml_from_float_t) quantize_row_iq4_nl_reference,
        .vec_dot                  = ggml_vec_dot_iq4_nl_q8_0,
        .vec_dot_type             = GGML_TYPE_Q8_0,
        .nrows                    = 1,
@@ -798,7 +813,7 @@ inline static float vaddvq_f32(float32x4_t v) {
    #define GGML_F16x8              float16x8_t
    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
-    #define GGML_F16x8_LOAD         vld1q_f16
+    #define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
    #define GGML_F16x8_STORE        vst1q_f16
    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
    #define GGML_F16x8_ADD          vaddq_f16
@@ -841,7 +856,7 @@ inline static float vaddvq_f32(float32x4_t v) {
    #define GGML_F32Cx4              float32x4_t
    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
-    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16(x))
+    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
    #define GGML_F32Cx4_ADD          vaddq_f32
@@ -5254,7 +5269,7 @@ static struct ggml_tensor * ggml_rope_impl(
        bool                  xpos_down,
        bool                  inplace) {
    GGML_ASSERT(ggml_is_vector(b));
-    GGML_ASSERT(b->type == GGML_TYPE_I32);
+    GGML_ASSERT(b->type == GGML_TYPE_F32);
    GGML_ASSERT(a->ne[2] == b->ne[0]);

    bool is_node = false;
@@ -5377,7 +5392,7 @@ struct ggml_tensor * ggml_rope_back(
        float                 xpos_base,
        bool                  xpos_down) {
    GGML_ASSERT(ggml_is_vector(b));
-    GGML_ASSERT(b->type == GGML_TYPE_I32);
+    GGML_ASSERT(b->type == GGML_TYPE_F32);
    GGML_ASSERT(a->ne[2] == b->ne[0]);

    GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
@@ -12352,11 +12367,11 @@ static void ggml_compute_forward_rope_f32(
    // this essentially just switches the sign of sin.
    const float sin_sign = forward ? 1.0f : -1.0f;

-    const int32_t * pos = (const int32_t *) src1->data;
+    const float * pos = (const float *) src1->data;

    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
+            const float p = pos[i2];

            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
@@ -12523,11 +12538,11 @@ static void ggml_compute_forward_rope_f16(
    // this essentially just switches the sign of sin.
    const float sin_sign = forward ? 1.0f : -1.0f;

-    const int32_t * pos = (const int32_t *) src1->data;
+    const float * pos = (const float *) src1->data;

    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
+            const float p = pos[i2];

            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
@@ -315,13 +315,7 @@
 extern "C" {
 #endif

-#if defined(__ARM_NEON) && defined(__CUDACC__)
-    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
-    typedef __fp16 ggml_fp16_t;
-#else
    typedef uint16_t ggml_fp16_t;
-#endif

    // convert FP16 <-> FP32
    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
@@ -509,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
        {
            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
@@ -1700,8 +1699,8 @@ struct llama_layer {
 };

 struct llama_kv_cell {
-    llama_pos pos   = -1;
-    llama_pos delta = 0;
+    float pos   = -1.0f;
+    float delta =  0.0f;

    std::set<llama_seq_id> seq_id;

@@ -1940,10 +1939,10 @@ struct llama_context {
    ggml_context * ctx_input = nullptr;
    struct ggml_tensor * inp_tokens;    // I32 [n_batch]
    struct ggml_tensor * inp_embd;      // F32 [n_embd, n_batch]
-    struct ggml_tensor * inp_pos;       // I32 [n_batch]
+    struct ggml_tensor * inp_pos;       // F32 [n_batch]
    struct ggml_tensor * inp_KQ_mask;   // F32 [n_ctx, n_batch]
    struct ggml_tensor * inp_KQ_pos;    // F32 [n_ctx]
-    struct ggml_tensor * inp_K_shift;   // I32 [n_ctx]
+    struct ggml_tensor * inp_K_shift;   // F32 [n_ctx]
    struct ggml_tensor * inp_mean;      // F32 [n_batch, n_batch]
    struct ggml_tensor * inp_cls;       // I32 [n_batch]

@@ -2223,7 +2222,7 @@ static void llama_kv_cache_seq_div(
                 llama_seq_id   seq_id,
                    llama_pos   p0,
                    llama_pos   p1,
-                          int   d) {
+                        float   d) {
    if (p0 < 0) p0 = 0;
    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

@@ -4056,7 +4055,10 @@ static bool llm_load_tensors(
                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);

-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                        // same as tok_embd, duplicated to allow offloading
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                    }

                    for (int i = 0; i < n_layer; ++i) {
@@ -5926,9 +5928,10 @@ struct llm_build_context {

        // get input vectors with right size
        const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+
+        struct ggml_tensor * inp_pos  = ggml_view_1d(ctx0, lctx.inp_pos,  n_tokens, 0);
        struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
-        struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
+        struct ggml_tensor * inp_cls  = ggml_view_1d(ctx0, lctx.inp_cls,  n_tokens, 0);

        // construct input embeddings (token, type, position)
        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -5936,8 +5939,9 @@ struct llm_build_context {
        // token types are hardcoded to zero ("Sentence A")
        struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
        inpL = ggml_add(ctx0, inpL, type_row0);
+
        if (model.arch == LLM_ARCH_BERT) {
-            inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+            inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, ggml_cast(ctx0, inp_pos, GGML_TYPE_I32)), inpL);
        }
        cb(inpL, "inp_embd", -1);

@@ -7450,6 +7454,7 @@ struct llm_build_context {

        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
        cb(inpL, "inp_embd", -1);
+
        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
        cb(inpL, "inp_scaled", -1);

@@ -7491,6 +7496,7 @@ struct llm_build_context {
                        n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
                cb(Qcur, "Qcur", il);
+
                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
                cb(Qcur, "Qcur_scaled", il);

@@ -7505,6 +7511,7 @@ struct llm_build_context {
                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
                cb(cur, "kqv_out", il);
            }
+
            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
            cb(sa_out, "sa_out", il);

@@ -7739,7 +7746,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {

        assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));

-        int32_t * data = (int32_t *) lctx.inp_K_shift->data;
+        float * data = (float *) lctx.inp_K_shift->data;

        for (int i = 0; i < n_ctx; ++i) {
            data[i] = lctx.kv_self.cells[i].delta;
@@ -10495,7 +10502,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        return std::make_pair(i_layer, n_layer);
    };

-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
        int nx = tensor->ne[0];
        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
            new_type = GGML_TYPE_Q8_0;
@@ -11682,10 +11692,10 @@ struct llama_context * llama_new_context_with_model(

            ctx->inp_tokens  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
            ctx->inp_embd    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
-            ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
+            ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch);
            ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
            ctx->inp_KQ_pos  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
-            ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
+            ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
            ctx->inp_mean    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
            ctx->inp_cls     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);

@@ -12038,7 +12048,7 @@ void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, l
    llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }

-void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, float d) {
    if (d == 1) {
        return;
    }
@@ -12453,7 +12463,7 @@ int llama_eval_embd(
                         int32_t   n_past) {
    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
+    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, (float) n_past, 1, 0, };

    const int ret = llama_decode_internal(*ctx, batch);
    if (ret < 0) {
@@ -54,7 +54,7 @@ extern "C" {
    struct llama_model;
    struct llama_context;

-    typedef int32_t llama_pos;
+    typedef float   llama_pos;
    typedef int32_t llama_token;
    typedef int32_t llama_seq_id;

@@ -531,7 +531,7 @@ extern "C" {
                    llama_seq_id   seq_id,
                       llama_pos   p0,
                       llama_pos   p1,
-                             int   d);
+                           float   d);

    //
    // State / sessions
@@ -1134,14 +1134,15 @@ struct test_rope : public test_case {

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
+        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[2]);
+        ggml_set_name(pos, "pos");
        ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx);
        return out;
    }

    void initialize_tensors(ggml_context * ctx) override {
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (t->type == GGML_TYPE_I32) {
+            if (strcmp(ggml_get_name(t), "pos") == 0) {
                // pos
                std::vector<int> data(ne[2]);
                for (int i = 0; i < ne[2]; i++) {
@@ -1703,7 +1704,7 @@ struct test_llama : public test_llm {
        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);

        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_tokens);

        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
@@ -1825,7 +1826,7 @@ struct test_falcon : public test_llm {
        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);

        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_tokens);

        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
@@ -1449,9 +1449,9 @@ int main(int argc, const char ** argv) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);

-                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
+                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
-                            ((int32_t *) p->data)[i] = n_past + i;
+                            ((float *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);
@@ -1489,9 +1489,9 @@ int main(int argc, const char ** argv) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);

-                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
+                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
-                            ((int32_t *) p->data)[i] = n_past + i;
+                            ((float *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);
@@ -143,10 +143,10 @@ int main(int argc, char * argv[]) {
            continue;
        }

-        printf("Testing %s\n", ggml_type_name((ggml_type) i));
-        ggml_quantize_init(ei);
+        if (qfns.from_float && qfns.to_float && qfns.vec_dot) {
+            printf("Testing %s\n", ggml_type_name((ggml_type) i));
+            ggml_quantize_init(ei);

-        if (qfns.from_float && qfns.to_float) {
            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
            const float max_quantization_error =
                type == GGML_TYPE_Q2_K    ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
@@ -275,7 +275,7 @@ int main(int argc, char * argv[]) {
            continue;
        }

-        if (qfns.from_float && qfns.to_float) {
+        if (qfns.from_float && qfns.to_float && qfns.vec_dot) {
            printf("%s\n", ggml_type_name(type));

            ggml_quantize_init(type);
@@ -146,14 +146,14 @@ int main(int /*argc*/, const char ** /*argv*/) {
        const int n_past_0 = 100;
        const int n_past_2 = 33;

-        struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+        struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ne[2]);
+        struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ne[2]);
+        struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ne[2]);

        for (int i = 0; i < ne[2]; ++i) {
-            ((int32_t *) p0->data)[i] = n_past_0 + i;
-            ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
-            ((int32_t *) p2->data)[i] = n_past_2 + i;
+            ((float *) p0->data)[i] = n_past_0 + i;
+            ((float *) p1->data)[i] = n_past_2 - n_past_0;
+            ((float *) p2->data)[i] = n_past_2 + i;
        }

        // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
Author	SHA1	Message	Date
Georgi Gerganov	608f449880	swift : fix build ggml-ci	2024-02-23 19:02:09 +02:00
Georgi Gerganov	fff1e8a54a	batched.swift : fix build ggml-ci	2024-02-23 16:15:37 +02:00
Georgi Gerganov	8772658b11	ggml : add I32 <-> F32 conversion ggml-ci	2024-02-23 14:25:05 +02:00
Georgi Gerganov	fc775366f1	llama : switch to floating-point token positions ggml-ci	2024-02-23 12:34:16 +02:00
Jared Van Bortel	15499eb942	mpt : do not duplicate token_embd.weight on disk (#5670 )	2024-02-22 17:05:23 -05:00
Georgi Gerganov	96633eeca1	gemma : use more bits for the token_embd.weight tensor (#5650 ) * gemma : use Q8_0 for the token_embd.weight tensor * llama : quantize token_embd.weight using output type	2024-02-22 23:23:46 +02:00
Georgi Gerganov	847eedbdb2	py : add Gemma conversion from HF models (#5647 ) * py : add gemma conversion from HF models * Update convert-hf-to-gguf.py Co-authored-by: Aarni Koskela <akx@iki.fi> * Update convert-hf-to-gguf.py Co-authored-by: Aarni Koskela <akx@iki.fi> * Update convert-hf-to-gguf.py Co-authored-by: Jared Van Bortel <jared@nomic.ai> --------- Co-authored-by: Aarni Koskela <akx@iki.fi> Co-authored-by: Jared Van Bortel <jared@nomic.ai>	2024-02-22 23:22:48 +02:00
Georgi Gerganov	7e4f339c40	ggml : always define ggml_fp16_t as uint16_t (#5666 ) * ggml : always define ggml_fp16_t as uint16_t ggml-ci * ggml : cont ggml-ci * ggml : cont * ggml : cont ggml-ci * ggml : cont ggml-ci * cuda : no longer ggml headers last ggml-ci * ggml : fix q6_K FP16 -> FP32 conversion ggml-ci * ggml : more FP16 -> FP32 conversion fixes ggml-ci	2024-02-22 23:21:39 +02:00