Compare commits

..

7 Commits

Author SHA1 Message Date
Bartowski e74c92e842 model : support GLM 4.6 (make a few NextN/MTP tensors not required) (#16359)
* Make a few GLM tensors not required

layer.nextn.shared_head_head and layer.nextn.embed_tokens are both excluded from GLM 4.6 resulting in the model not loading after conversion/quantization, this marks those tensors as not required which makes it work

* Update llama-model.cpp

layer.nextn.shared_head_norm also not required in case of future models
2025-09-30 22:24:36 +02:00
Sigbjørn Skjæret b2ba81dbe0 ci : fix ccache key for ubuntu-cpu-cmake (#16355)
* fix ccache key for ubuntu-cpu-cmake

* set it for release as well [no ci]
2025-09-30 21:41:42 +02:00
Adrien Gallouët bf6f3b3a19 common : disable progress bar without a tty (#16352)
* common : disable progress bar without a tty

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Add missing headers

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

---------

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2025-09-30 20:52:41 +03:00
lhez 7c156df414 opencl: support pad_ext (#15888) 2025-09-30 10:45:45 -07:00
Pascal 16b0ca0d2e Chatapi ignore empty sampling (#16330)
* fix: skip empty sampling fields instead of coercing to 0 in chat API options

* chore: update webui build output
2025-09-30 19:18:54 +02:00
Reese Levine 8d78cd2613 ggml webgpu: support for rope,div,sub,glu,scale,cont operators (#16187)
* Work on rope

* Simplify inplace operation generation and combine mul/add generation

* Work on rope variants

* implement neox rope

* rope complete

* Add sub,div,glu operators

* implement scale op

* Update cpy shader to handle cont/more types

* formatting

* Update test vars printing for rope,rms_norm

* Avoid ROPE hardcoded constants

* Add TODO to change ROPE constants to enum

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix TODO comment

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-09-30 09:57:51 -07:00
lhez d1c84a662d opencl: support ne3 in get_rows (#15866) 2025-09-30 09:55:13 -07:00
26 changed files with 1721 additions and 491 deletions
+1 -1
View File
@@ -207,7 +207,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-cpu-cmake
key: ubuntu-cpu-cmake-${{ matrix.build }}
evict-old-files: 1d
- name: Build Dependencies
+1 -1
View File
@@ -150,7 +150,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-cpu-cmake
key: ubuntu-cpu-cmake-${{ matrix.build }}
evict-old-files: 1d
- name: Dependencies
+20 -1
View File
@@ -54,6 +54,13 @@
#endif
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
// isatty
#if defined(_WIN32)
#include <io.h>
#else
#include <unistd.h>
#endif
using json = nlohmann::ordered_json;
std::initializer_list<enum llama_example> mmproj_examples = {
@@ -100,6 +107,14 @@ static void write_file(const std::string & fname, const std::string & content) {
}
}
static bool is_output_a_tty() {
#if defined(_WIN32)
return _isatty(_fileno(stdout));
#else
return isatty(1);
#endif
}
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
this->examples = std::move(examples);
return *this;
@@ -652,7 +667,11 @@ static std::string show_masked_url(const common_url & parts) {
return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
}
static void print_progress(size_t current, size_t total) { // TODO isatty
static void print_progress(size_t current, size_t total) {
if (!is_output_a_tty()) {
return;
}
if (!total) {
return;
}
+2
View File
@@ -237,6 +237,8 @@
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1
// TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726
#define GGML_ROPE_TYPE_NORMAL 0
#define GGML_ROPE_TYPE_NEOX 2
#define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24
+74 -32
View File
@@ -2889,10 +2889,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_OP_REPEAT:
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
case GGML_OP_PAD:
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
op->src[0]->ne[3] == 1 && op->ne[3] == 1 &&
(ggml_get_op_params_i32(op, 0) == 0) && (ggml_get_op_params_i32(op, 2) == 0) &&
(ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0);
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
case GGML_OP_CONV_2D:
@@ -4222,15 +4219,19 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
const int ne00 = src0 ? src0->ne[0] : 0;
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
const int ne10 = src1 ? src1->ne[0] : 0;
const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
const int ne11 = src1 ? src1->ne[1] : 0;
const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
const cl_ulong nb1 = dst ? dst->nb[1] : 0;
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
const int ne00 = src0->ne[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const int ne10 = src1->ne[0];
const cl_ulong nb10 = src1->nb[0];
const int ne11 = src1->ne[1];
const int ne12 = src1->ne[2];
const cl_ulong nb11 = src1->nb[1];
const cl_ulong nb12 = src1->nb[2];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
@@ -4267,14 +4268,17 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));
size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
size_t local_work_size[] = {1, 1, 1};
size_t global_work_size[] = {(size_t)ne10*64, (size_t)ne11, (size_t)ne12};
size_t local_work_size[] = {64, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
@@ -5874,7 +5878,6 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
GGML_ASSERT(dst->extra);
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
@@ -5892,28 +5895,67 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
const int s_ne0 = src0->ne[0];
const int s_ne1 = src0->ne[1];
const int s_ne2 = src0->ne[2];
const int s_ne3 = src0->ne[3];
const int s_nb0 = src0->nb[0];
const int s_nb1 = src0->nb[1];
const int s_nb2 = src0->nb[2];
const int s_nb3 = src0->nb[3];
const int d_ne0 = dst->ne[0];
const int d_ne1 = dst->ne[1];
const int d_ne2 = dst->ne[2];
const int d_ne3 = dst->ne[3];
const int d_nb0 = dst->nb[0];
const int d_nb1 = dst->nb[1];
const int d_nb2 = dst->nb[2];
const int d_nb3 = dst->nb[3];
const int lp0 = ((const int*)(dst->op_params))[0];
const int rp0 = ((const int*)(dst->op_params))[1];
const int lp1 = ((const int*)(dst->op_params))[2];
const int rp1 = ((const int*)(dst->op_params))[3];
const int lp2 = ((const int*)(dst->op_params))[4];
const int rp2 = ((const int*)(dst->op_params))[5];
const int lp3 = ((const int*)(dst->op_params))[6];
const int rp3 = ((const int*)(dst->op_params))[7];
cl_kernel kernel = backend_ctx->kernel_pad;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne0));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne1));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne2));
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &s_ne3));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &s_nb0));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &s_nb1));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &s_nb2));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &s_nb3));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &d_ne3));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &d_nb0));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &d_nb1));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &d_nb2));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &d_nb3));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &lp0));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &rp0));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &lp1));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &rp1));
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &lp2));
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &rp2));
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &lp3));
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(int), &rp3));
size_t lws0 = 64;
size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0;
size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2 };
size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2*d_ne3 };
size_t local_work_size[] = { lws0, 1, 1 };
size_t * local_work_size_ptr = local_work_size;
+36 -12
View File
@@ -69,11 +69,14 @@ kernel void kernel_get_rows_f32(
int ne00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb1,
ulong nb2
ulong nb2,
ulong nb3
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
@@ -81,14 +84,19 @@ kernel void kernel_get_rows_f32(
int i10 = get_group_id(0);
int i11 = get_group_id(1);
int i12 = get_group_id(2);
int r = ((global int *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
int r = ((global int *) ((global char *) src1 + i12*nb12 + i11*nb11 + i10*nb10))[0];
int i02 = i11;
int i03 = i12;
for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
((global float *) ((global char *) dst + i11*nb2 + i10*nb1))[ind] =
((global float *) ((global char *) src0 + r*nb01 + i02*nb02))[ind];
if (ind >= ne00) {
return;
}
((global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1))[ind] =
((global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03))[ind];
}
}
@@ -102,11 +110,14 @@ kernel void kernel_get_rows_f16(
int ne00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb1,
ulong nb2
ulong nb2,
ulong nb3
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
@@ -114,14 +125,19 @@ kernel void kernel_get_rows_f16(
int i10 = get_group_id(0);
int i11 = get_group_id(1);
int i12 = get_group_id(2);
int r = ((global int32_t *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
int r = ((global int32_t *) ((global char *) src1 + i12*nb12 + i11*nb11 + i10*nb10))[0];
int i02 = i11;
int i03 = i12;
for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
((global float *) ((global char *) dst + i11*nb2 + i10*nb1))[ind] =
((global half *) ((global char *) src0 + r*nb01 + i02*nb02))[ind];
if (ind >= ne00) {
return;
}
((global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1))[ind] =
((global half *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03))[ind];
}
}
@@ -135,11 +151,14 @@ kernel void kernel_get_rows_q4_0(
int ne00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb1,
ulong nb2
ulong nb2,
ulong nb3
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
@@ -149,15 +168,20 @@ kernel void kernel_get_rows_q4_0(
int i10 = get_group_id(0);
int i11 = get_group_id(1);
int i12 = get_group_id(2);
int r = ((global int32_t *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
int r = ((global int32_t *) ((global char *) src1 + i12*nb12 + i11*nb11 + i10*nb10))[0];
int i02 = i11;
int i03 = i12;
for (int ind = get_local_id(0); ind < ne00/16; ind += get_local_size(0)) {
float16 temp;
if (ind >= ne00) {
return;
}
dequantize_q4_0_f32(
((global struct block_q4_0 *) ((global char *) src0 + r*nb01 + i02*nb02)) + ind/NL, ind%NL, &temp);
*(((global float16 *) ((global char *) dst + i11*nb2 + i10*nb1)) + ind) = temp;
((global struct block_q4_0 *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03)) + ind/NL, ind%NL, &temp);
*(((global float16 *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1)) + ind) = temp;
}
}
+29 -20
View File
@@ -1,30 +1,39 @@
kernel void kernel_pad(
global const void * src0_ptr,
ulong src0_offset,
global void * dst_ptr,
ulong dst_offset,
int s_ne0, int s_ne1, int s_ne2,
int d_ne0, int d_ne1, int d_ne2
global void * src0,
ulong offset0,
global void * dst,
ulong offsetd,
int ne00, int ne01, int ne02, int ne03,
ulong nb00, ulong nb01, ulong nb02, ulong nb03,
int ne0, int ne1, int ne2, int ne3,
ulong nb0, ulong nb1, ulong nb2, ulong nb3,
int lp0, int rp0,
int lp1, int rp1,
int lp2, int rp2,
int lp3, int rp3
) {
global const float * src0 = (global const float *)((global const char *)src0_ptr + src0_offset);
global float * dst = (global float *)((global char *)dst_ptr + dst_offset);
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
int nidx = get_global_id(0);
int idx_d1 = get_group_id(1);
int idx_d2 = get_group_id(2);
int i0 = get_global_id(0);
int i1 = get_group_id(1);
int i2 = get_group_id(2) % ne2;
int i3 = get_group_id(2) / ne2;
if (nidx >= d_ne0) {
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
return;
}
int dst_el_offset = nidx + idx_d1 * d_ne0 + idx_d2 * d_ne0 * d_ne1;
uint src0_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
uint dst_idx = i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0;
bool in_src_bounds = (nidx < s_ne0) && (idx_d1 < s_ne1) && (idx_d2 < s_ne2);
global float * src0_ptr = (global float *)((global char *)src0 + src0_idx);
global float * dst_ptr = (global float *)((global char *)dst + dst_idx);
if (in_src_bounds) {
int src_el_offset = nidx + idx_d1 * s_ne0 + idx_d2 * s_ne0 * s_ne1;
dst[dst_el_offset] = src0[src_el_offset];
} else {
dst[dst_el_offset] = 0.0f;
}
bool in_src_bounds = (i0 >= lp0 && i0 < ne0 - rp0) &&
(i1 >= lp1 && i1 < ne1 - rp1) &&
(i2 >= lp2 && i2 < ne2 - rp2) &&
(i3 >= lp3 && i3 < ne3 - rp3);
*dst_ptr = in_src_bounds ? *src0_ptr : 0.0f;
}
+414 -74
View File
@@ -130,13 +130,15 @@ struct webgpu_context_struct {
wgpu::ComputePipeline set_rows_pipeline;
wgpu::ComputePipeline get_rows_pipeline[30];
wgpu::ComputePipeline get_rows_f32_no_vec_pipeline;
wgpu::ComputePipeline cpy_pipeline;
wgpu::ComputePipeline add_pipeline[2];
wgpu::ComputePipeline add_ip_pipeline[2];
wgpu::ComputePipeline mul_pipeline[2];
wgpu::ComputePipeline mul_ip_pipeline[2];
wgpu::ComputePipeline rms_norm_pipeline;
wgpu::ComputePipeline rms_norm_ip_pipeline;
wgpu::ComputePipeline cpy_pipeline[2][2]; // src type, dst type
wgpu::ComputePipeline add_pipeline[2][2]; // type, inplace
wgpu::ComputePipeline sub_pipeline[2][2]; // type, inplace
wgpu::ComputePipeline mul_pipeline[2][2]; // type, inplace
wgpu::ComputePipeline div_pipeline[2][2]; // type, inplace
wgpu::ComputePipeline rms_norm_pipeline[2]; // inplace
wgpu::ComputePipeline rope_pipeline[2][2][2]; // type, ff, inplace
wgpu::ComputePipeline glu_pipeline[7][2][2]; // glu-op, type, split
wgpu::ComputePipeline scale_pipeline[2]; // inplace
size_t memset_bytes_per_thread;
@@ -489,8 +491,9 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor
(uint32_t) (src->nb[2] / ggml_type_size(src->type)), (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
(uint32_t) (dst->nb[0] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
// Logical shape — same for both tensors even if permuted
(uint32_t) src->ne[0], (uint32_t) src->ne[1], (uint32_t) src->ne[2], (uint32_t) src->ne[3]
// Logical shapes
(uint32_t) src->ne[0], (uint32_t) src->ne[1], (uint32_t) src->ne[2], (uint32_t) dst->ne[0],
(uint32_t) dst->ne[1], (uint32_t) dst->ne[2]
};
std::vector<wgpu::BindGroupEntry> entries = {
@@ -506,7 +509,8 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor
size_t max_wg_size = ctx->max_wg_size_x;
uint32_t wg_x = (ne + max_wg_size - 1) / max_wg_size;
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->cpy_pipeline, params, entries, wg_x, ggml_op_name(dst->op));
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->cpy_pipeline[src->type][dst->type], params, entries, wg_x,
ggml_op_name(dst->op));
}
static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) {
@@ -649,7 +653,7 @@ static void ggml_webgpu_binary_op(webgpu_context & ctx,
ggml_tensor * src1,
ggml_tensor * dst,
wgpu::ComputePipeline & pipeline,
bool in_place) {
bool inplace) {
std::vector<uint32_t> params = {
(uint32_t) ggml_nelements(dst),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
@@ -678,7 +682,7 @@ static void ggml_webgpu_binary_op(webgpu_context & ctx,
.offset = ggml_webgpu_tensor_align_offset(ctx, src1),
.size = ggml_webgpu_tensor_binding_size(ctx, src1) }
};
if (!in_place) {
if (!inplace) {
entries.push_back({ .binding = 2,
.buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
@@ -691,30 +695,23 @@ static void ggml_webgpu_binary_op(webgpu_context & ctx,
}
static void ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
bool in_place = ggml_webgpu_tensor_equal(src, dst);
uint32_t eps;
memcpy(&eps, dst->op_params, sizeof(float));
int inplace = ggml_webgpu_tensor_equal(src, dst);
std::vector<uint32_t> params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
(uint32_t) (src->nb[1] / ggml_type_size(src->type)),
(uint32_t) (src->nb[2] / ggml_type_size(src->type)),
(uint32_t) (src->nb[3] / ggml_type_size(src->type)),
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
(uint32_t) src->ne[0],
(uint32_t) src->ne[1],
(uint32_t) src->ne[2],
(uint32_t) src->ne[3],
*(uint32_t *) dst->op_params // epsilon, treated as f32 in the shader
};
if (!in_place) {
params.push_back((uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)));
}
params.push_back((uint32_t) (src->nb[1] / ggml_type_size(src->type)));
params.push_back((uint32_t) (src->nb[2] / ggml_type_size(src->type)));
params.push_back((uint32_t) (src->nb[3] / ggml_type_size(src->type)));
if (!in_place) {
params.push_back((uint32_t) (dst->nb[1] / ggml_type_size(dst->type)));
params.push_back((uint32_t) (dst->nb[2] / ggml_type_size(dst->type)));
params.push_back((uint32_t) (dst->nb[3] / ggml_type_size(dst->type)));
}
params.push_back((uint32_t) src->ne[0]);
params.push_back((uint32_t) src->ne[1]);
params.push_back((uint32_t) src->ne[2]);
params.push_back((uint32_t) src->ne[3]);
params.push_back(eps); // epsilon, will be bitcast to float in shader
std::vector<wgpu::BindGroupEntry> entries = {
{ .binding = 0,
@@ -722,24 +719,199 @@ static void ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_t
.offset = ggml_webgpu_tensor_align_offset(ctx, src),
.size = ggml_webgpu_tensor_binding_size(ctx, src) }
};
if (!in_place) {
if (!inplace) {
entries.push_back({ .binding = 1,
.buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
}
wgpu::ComputePipeline pipeline;
if (in_place) {
pipeline = ctx->rms_norm_ip_pipeline;
} else {
pipeline = ctx->rms_norm_pipeline;
}
size_t max_wg_size = ctx->max_wg_size_x;
uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size;
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->rms_norm_pipeline[inplace], params, entries, wg_x,
ggml_op_name(dst->op));
}
static void ggml_webgpu_rope(webgpu_context & ctx,
ggml_tensor * src0,
ggml_tensor * src1,
ggml_tensor * src2,
ggml_tensor * dst) {
const int inplace = ggml_webgpu_tensor_equal(src0, dst);
const int has_freq_factor = (src2 != nullptr);
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
int sections[4];
memcpy(sections, (int32_t *) dst->op_params + 11, 4 * sizeof(int));
float theta_scale = powf(freq_base, -2.0f / n_dims);
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
std::vector<uint32_t> params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
src2 != nullptr ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src2) / ggml_type_size(src2->type)) : 0,
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
(uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
(uint32_t) ggml_nelements(src0) / 2,
(uint32_t) src0->ne[0],
(uint32_t) src0->ne[1],
(uint32_t) src0->ne[2],
(uint32_t) n_dims,
(uint32_t) mode,
*(uint32_t *) &theta_scale,
*(uint32_t *) &attn_factor,
*(uint32_t *) &freq_scale,
*(uint32_t *) &ext_factor,
*(uint32_t *) &corr_dims[0],
*(uint32_t *) &corr_dims[1],
(uint32_t) sections[0],
(uint32_t) sections[1],
(uint32_t) sections[2],
(uint32_t) sections[3]
};
std::vector<wgpu::BindGroupEntry> entries = {
{ .binding = 0,
.buffer = ggml_webgpu_tensor_buf(src0),
.offset = ggml_webgpu_tensor_align_offset(ctx, src0),
.size = ggml_webgpu_tensor_binding_size(ctx, src0) },
{ .binding = 1,
.buffer = ggml_webgpu_tensor_buf(src1),
.offset = ggml_webgpu_tensor_align_offset(ctx, src1),
.size = ggml_webgpu_tensor_binding_size(ctx, src1) }
};
uint32_t dst_binding = 2;
if (has_freq_factor) {
dst_binding = 3;
entries.push_back({ .binding = 2,
.buffer = ggml_webgpu_tensor_buf(src2),
.offset = ggml_webgpu_tensor_align_offset(ctx, src2),
.size = ggml_webgpu_tensor_binding_size(ctx, src2) });
}
if (!inplace) {
entries.push_back({ .binding = dst_binding,
.buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
}
wgpu::ComputePipeline pipeline = ctx->rope_pipeline[dst->type][has_freq_factor][inplace];
size_t max_wg_size = ctx->max_wg_size_x;
uint32_t wg_x = (ggml_nelements(src0) / 2 + max_wg_size - 1) / max_wg_size;
ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op));
}
static void ggml_webgpu_glu(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
const int split = (src1 != nullptr);
std::vector<uint32_t> params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
src1 != nullptr ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)) : 0,
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
(uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
src1 != nullptr ? (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)) :
(uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
src1 != nullptr ? (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)) :
(uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
src1 != nullptr ? (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)) :
(uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
(uint32_t) ggml_nelements(dst),
(uint32_t) dst->ne[0],
(uint32_t) dst->ne[1],
(uint32_t) dst->ne[2],
(uint32_t) ((int32_t *) dst->op_params)[1], // swapped
*(uint32_t *) &dst->op_params[2], // alpha, for swiglu_oai
*(uint32_t *) &dst->op_params[3], // limit, for swiglu_oai
};
std::vector<wgpu::BindGroupEntry> entries = {
{ .binding = 0,
.buffer = ggml_webgpu_tensor_buf(src0),
.offset = ggml_webgpu_tensor_align_offset(ctx, src0),
.size = ggml_webgpu_tensor_binding_size(ctx, src0) },
};
uint32_t dst_binding = 1;
if (split) {
dst_binding = 2;
entries.push_back({ .binding = 1,
.buffer = ggml_webgpu_tensor_buf(src1),
.offset = ggml_webgpu_tensor_align_offset(ctx, src1),
.size = ggml_webgpu_tensor_binding_size(ctx, src1) });
}
entries.push_back({ .binding = dst_binding,
.buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
wgpu::ComputePipeline pipeline = ctx->glu_pipeline[ggml_get_glu_op(dst)][dst->type][split];
size_t max_wg_size = ctx->max_wg_size_x;
uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size;
ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op));
}
static void ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
int inplace = ggml_webgpu_tensor_equal(src, dst);
std::vector<uint32_t> params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
(uint32_t) (src->nb[1] / ggml_type_size(src->type)),
(uint32_t) (src->nb[2] / ggml_type_size(src->type)),
(uint32_t) (src->nb[3] / ggml_type_size(src->type)),
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
(uint32_t) ggml_nelements(dst),
(uint32_t) src->ne[0],
(uint32_t) src->ne[1],
(uint32_t) src->ne[2],
*(uint32_t *) dst->op_params, // scale
*(uint32_t *) &dst->op_params[1] // bias
};
std::vector<wgpu::BindGroupEntry> entries = {
{ .binding = 0,
.buffer = ggml_webgpu_tensor_buf(src),
.offset = ggml_webgpu_tensor_align_offset(ctx, src),
.size = ggml_webgpu_tensor_binding_size(ctx, src) }
};
if (!inplace) {
entries.push_back({ .binding = 1,
.buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
}
size_t max_wg_size = ctx->max_wg_size_x;
uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size;
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->scale_pipeline[inplace], params, entries, wg_x,
ggml_op_name(dst->op));
}
// Returns true if node has enqueued work into the queue, false otherwise
static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
if (ggml_is_empty(node)) {
@@ -749,6 +921,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
ggml_tensor * src0 = node->src[0];
ggml_tensor * src1 = node->src[1];
ggml_tensor * src2 = node->src[2];
switch (node->op) {
// no-ops
@@ -759,6 +932,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
case GGML_OP_RESHAPE:
return false;
case GGML_OP_CPY:
case GGML_OP_CONT:
ggml_webgpu_cpy(ctx, src0, node);
break;
case GGML_OP_SET_ROWS:
@@ -771,22 +945,41 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
ggml_webgpu_mul_mat(ctx, src0, src1, node);
break;
case GGML_OP_ADD:
if (ggml_webgpu_tensor_equal(src0, node)) {
ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_ip_pipeline[node->type], true);
} else {
ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type], false);
{
int inplace = ggml_webgpu_tensor_equal(src0, node);
ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type][inplace], inplace);
break;
}
case GGML_OP_SUB:
{
int inplace = ggml_webgpu_tensor_equal(src0, node);
ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline[node->type][inplace], inplace);
break;
}
break;
case GGML_OP_MUL:
if (ggml_webgpu_tensor_equal(src0, node)) {
ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_ip_pipeline[node->type], true);
} else {
ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type], false);
{
int inplace = ggml_webgpu_tensor_equal(src0, node);
ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type][inplace], inplace);
break;
}
case GGML_OP_DIV:
{
int inplace = ggml_webgpu_tensor_equal(src0, node);
ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline[node->type][inplace], inplace);
break;
}
break;
case GGML_OP_RMS_NORM:
ggml_webgpu_rms_norm(ctx, src0, node);
break;
case GGML_OP_ROPE:
ggml_webgpu_rope(ctx, src0, src1, src2, node);
break;
case GGML_OP_GLU:
ggml_webgpu_glu(ctx, src0, src1, node);
break;
case GGML_OP_SCALE:
ggml_webgpu_scale(ctx, src0, node);
break;
default:
return false;
}
@@ -1170,40 +1363,153 @@ static void ggml_webgpu_init_get_rows_pipeline(webgpu_context & webgpu_ctx) {
}
static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) {
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline, wgsl_cpy, "cpy",
ggml_webgpu_max_wg_size_entry(webgpu_ctx));
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline[GGML_TYPE_F32][GGML_TYPE_F32],
wgsl_cpy_f32_f32, "cpy_f32_f32", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline[GGML_TYPE_F32][GGML_TYPE_F16],
wgsl_cpy_f32_f16, "cpy_f32_f16", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline[GGML_TYPE_F16][GGML_TYPE_F32],
wgsl_cpy_f16_f32, "cpy_f16_f32", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline[GGML_TYPE_F16][GGML_TYPE_F16],
wgsl_cpy_f16_f16, "cpy_f16_f16", constants);
}
static void ggml_webgpu_init_add_pipeline(webgpu_context & webgpu_ctx) {
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32], wgsl_add_f32, "add_f32",
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0], wgsl_add_f32, "add_f32",
constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16], wgsl_add_f16, "add_f16",
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0], wgsl_add_f16, "add_f16",
constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_ip_pipeline[GGML_TYPE_F32], wgsl_add_in_place_f32,
"add_in_place_f32", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_ip_pipeline[GGML_TYPE_F16], wgsl_add_in_place_f16,
"add_in_place_f16", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1], wgsl_add_f32_inplace,
"add_f32_inplace", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1], wgsl_add_f16_inplace,
"add_f16_inplace", constants);
}
static void ggml_webgpu_init_sub_pipeline(webgpu_context & webgpu_ctx) {
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0], wgsl_sub_f32, "sub_f32",
constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0], wgsl_sub_f16, "sub_f16",
constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1], wgsl_sub_f32_inplace,
"sub_f32_inplace", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1], wgsl_sub_f16_inplace,
"sub_f16_inplace", constants);
}
static void ggml_webgpu_init_mul_pipeline(webgpu_context & webgpu_ctx) {
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32], wgsl_mul_f32, "mul_f32",
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0], wgsl_mul_f32, "mul_f32",
constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16], wgsl_mul_f16, "mul_f16",
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0], wgsl_mul_f16, "mul_f16",
constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_ip_pipeline[GGML_TYPE_F32], wgsl_mul_in_place_f32,
"mul_in_place_f32", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_ip_pipeline[GGML_TYPE_F16], wgsl_mul_in_place_f16,
"mul_in_place_f16", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1], wgsl_mul_f32_inplace,
"mul_f32_inplace", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1], wgsl_mul_f16_inplace,
"mul_f16_inplace", constants);
}
static void ggml_webgpu_init_div_pipeline(webgpu_context & webgpu_ctx) {
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0], wgsl_div_f32, "div_f32",
constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][0], wgsl_div_f16, "div_f16",
constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1], wgsl_div_f32_inplace,
"div_f32_inplace", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1], wgsl_div_f16_inplace,
"div_f16_inplace", constants);
}
static void ggml_webgpu_init_rms_norm_pipeline(webgpu_context & webgpu_ctx) {
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rms_norm_pipeline, wgsl_rms_norm, "rms_norm",
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rms_norm_pipeline[0], wgsl_rms_norm, "rms_norm",
constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rms_norm_ip_pipeline, wgsl_rms_norm_in_place,
"rms_norm_in_place", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rms_norm_pipeline[1], wgsl_rms_norm_inplace,
"rms_norm_inplace", constants);
}
static void ggml_webgpu_init_rope_pipeline(webgpu_context & webgpu_ctx) {
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][0][0], wgsl_rope_f32,
"rope_f32", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][0][1],
wgsl_rope_f32_inplace, "rope_f32_inplace", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][1][0], wgsl_rope_f32_ff,
"rope_f32_ff", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F32][1][1],
wgsl_rope_f32_ff_inplace, "rope_f32_ff_inplace", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][0][0], wgsl_rope_f16,
"rope_f16", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][0][1],
wgsl_rope_f16_inplace, "rope_f16_inplace", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][1][0], wgsl_rope_f16_ff,
"rope_f16_ff", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->rope_pipeline[GGML_TYPE_F16][1][1],
wgsl_rope_f16_ff_inplace, "rope_f16_ff_inplace", constants);
}
static void ggml_webgpu_init_glu_pipeline(webgpu_context & webgpu_ctx) {
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
// reglu
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_REGLU][GGML_TYPE_F32][0],
wgsl_reglu_f32, "reglu_f32", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_REGLU][GGML_TYPE_F16][0],
wgsl_reglu_f16, "reglu_f16", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_REGLU][GGML_TYPE_F32][1],
wgsl_reglu_f32_split, "reglu_f32_split", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_REGLU][GGML_TYPE_F16][1],
wgsl_reglu_f16_split, "reglu_f16_split", constants);
// geglu
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU][GGML_TYPE_F32][0],
wgsl_geglu_f32, "geglu_f32", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU][GGML_TYPE_F16][0],
wgsl_geglu_f16, "geglu_f16", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU][GGML_TYPE_F32][1],
wgsl_geglu_f32_split, "geglu_f32_split", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU][GGML_TYPE_F16][1],
wgsl_geglu_f16_split, "geglu_f16_split", constants);
// swiglu
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU][GGML_TYPE_F32][0],
wgsl_swiglu_f32, "swiglu_f32", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU][GGML_TYPE_F16][0],
wgsl_swiglu_f16, "swiglu_f16", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU][GGML_TYPE_F32][1],
wgsl_swiglu_f32_split, "swiglu_f32_split", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU][GGML_TYPE_F16][1],
wgsl_swiglu_f16_split, "swiglu_f16_split", constants);
// swiglu_oai
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU_OAI][GGML_TYPE_F32][0],
wgsl_swiglu_oai_f32, "swiglu_oai_f32", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU_OAI][GGML_TYPE_F32][1],
wgsl_swiglu_oai_f32_split, "swiglu_oai_f32_split", constants);
// geglu_erf
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_ERF][GGML_TYPE_F32][0],
wgsl_geglu_erf_f32, "geglu_erf_f32", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_ERF][GGML_TYPE_F16][0],
wgsl_geglu_erf_f16, "geglu_erf_f16", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_ERF][GGML_TYPE_F32][1],
wgsl_geglu_erf_f32_split, "geglu_erf_f32_split", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_ERF][GGML_TYPE_F16][1],
wgsl_geglu_erf_f16_split, "geglu_erf_f16_split", constants);
// geglu_quick
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_QUICK][GGML_TYPE_F32][0],
wgsl_geglu_quick_f32, "geglu_quick_f32", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_QUICK][GGML_TYPE_F16][0],
wgsl_geglu_quick_f16, "geglu_quick_f16", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_QUICK][GGML_TYPE_F32][1],
wgsl_geglu_quick_f32_split, "geglu_quick_f32_split", constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_QUICK][GGML_TYPE_F16][1],
wgsl_geglu_quick_f16_split, "geglu_quick_f16_split", constants);
}
static void ggml_webgpu_init_scale_pipeline(webgpu_context & webgpu_ctx) {
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_max_wg_size_entry(webgpu_ctx);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->scale_pipeline[0], wgsl_scale_f32, "scale_f32",
constants);
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->scale_pipeline[1], wgsl_scale_f32_inplace,
"scale_f32_inplace", constants);
}
static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
@@ -1287,6 +1593,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
ggml_tensor * src0 = op->src[0];
ggml_tensor * src1 = op->src[1];
// on smaller devices (or CI), tensors may be larger than the max storage buffer size
if (ggml_nbytes(op) > webgpu_ctx->limits.maxStorageBufferBindingSize ||
(src0 != nullptr && ggml_nbytes(src0) > webgpu_ctx->limits.maxStorageBufferBindingSize) ||
@@ -1304,28 +1611,34 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
supports_op = true;
break;
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_MUL:
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (op->src[0]->type == op->type) &&
(op->src[1]->type == op->type);
case GGML_OP_DIV:
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type) &&
(src1->type == op->type);
break;
case GGML_OP_CPY:
case GGML_OP_CONT:
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
break;
case GGML_OP_SET_ROWS:
supports_op = (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64);
break;
case GGML_OP_GET_ROWS:
if (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16 ||
op->src[0]->type == GGML_TYPE_I32 || ggml_webgpu_supported_qtype(op->src[0]->type)) {
if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_I32 ||
ggml_webgpu_supported_qtype(src0->type)) {
supports_op = (op->type == GGML_TYPE_F32);
}
break;
case GGML_OP_MUL_MAT:
{
switch (op->src[1]->type) {
switch (src1->type) {
case GGML_TYPE_F16:
supports_op = (op->src[0]->type == GGML_TYPE_F16);
supports_op |= (src0->type == GGML_TYPE_F16);
break;
case GGML_TYPE_F32:
switch (op->src[0]->type) {
switch (src0->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
@@ -1358,7 +1671,29 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
break;
}
case GGML_OP_RMS_NORM:
supports_op = op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32;
supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
break;
case GGML_OP_ROPE:
supports_op = op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16;
break;
case GGML_OP_GLU:
switch (ggml_get_glu_op(op)) {
case GGML_GLU_OP_REGLU:
case GGML_GLU_OP_GEGLU:
case GGML_GLU_OP_SWIGLU:
case GGML_GLU_OP_GEGLU_ERF:
case GGML_GLU_OP_GEGLU_QUICK:
supports_op = op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16;
break;
case GGML_GLU_OP_SWIGLU_OAI:
supports_op = op->type == GGML_TYPE_F32;
break;
default:
break;
}
break;
case GGML_OP_SCALE:
supports_op = op->type == GGML_TYPE_F32;
break;
default:
break;
@@ -1484,8 +1819,13 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
ggml_webgpu_init_get_rows_pipeline(ctx);
ggml_webgpu_init_cpy_pipeline(ctx);
ggml_webgpu_init_add_pipeline(ctx);
ggml_webgpu_init_sub_pipeline(ctx);
ggml_webgpu_init_mul_pipeline(ctx);
ggml_webgpu_init_div_pipeline(ctx);
ggml_webgpu_init_rms_norm_pipeline(ctx);
ggml_webgpu_init_rope_pipeline(ctx);
ggml_webgpu_init_glu_pipeline(ctx);
ggml_webgpu_init_scale_pipeline(ctx);
#ifdef GGML_WEBGPU_DEBUG
// Initialize debug buffers
@@ -1,44 +0,0 @@
#define(VARIANTS)
[
{
"REPLS": {
"TYPE" : "f32",
}
},
{
"REPLS": {
"TYPE" : "f16",
}
}
]
#end(VARIANTS)
#define(SHADER)
enable f16;
#include "binary_head.tmpl"
@group(0) @binding(0)
var<storage, read_write> src0: array<{{TYPE}}>;
@group(0) @binding(1)
var<storage, read_write> src1: array<{{TYPE}}>;
@group(0) @binding(2)
var<storage, read_write> dst: array<{{TYPE}}>;
@group(0) @binding(3)
var<uniform> params: Params;
override wg_size: u32;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x < params.ne) {
dst[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] + src1[params.offset_src1 + src1_index(gid.x)];
}
}
#end(SHADER)
@@ -1,41 +0,0 @@
#define(VARIANTS)
[
{
"REPLS": {
"TYPE" : "f32",
}
},
{
"REPLS": {
"TYPE" : "f16",
}
}
]
#end(VARIANTS)
#define(SHADER)
enable f16;
#include "binary_head.tmpl"
@group(0) @binding(0)
var<storage, read_write> src0: array<{{TYPE}}>;
@group(0) @binding(1)
var<storage, read_write> src1: array<{{TYPE}}>;
@group(0) @binding(2)
var<uniform> params: Params;
override wg_size: u32;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x < params.ne) {
src0[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] + src1[params.offset_src1 + src1_index(gid.x)];
}
}
#end(SHADER)
@@ -0,0 +1,188 @@
#define(VARIANTS)
[
{
"SHADER_NAME": "add_f32",
"REPLS": {
"TYPE" : "f32",
"OP": "+"
},
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "add_f16",
"REPLS": {
"TYPE" : "f16",
"OP": "+"
},
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "add_f32_inplace",
"REPLS": {
"TYPE" : "f32",
"OP": "+"
},
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "add_f16_inplace",
"REPLS": {
"TYPE" : "f16",
"OP": "+"
},
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "mul_f32",
"REPLS": {
"TYPE" : "f32",
"OP": "*"
},
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "mul_f16",
"REPLS": {
"TYPE" : "f16",
"OP": "*"
},
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "mul_f32_inplace",
"REPLS": {
"TYPE" : "f32",
"OP": "*"
},
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "mul_f16_inplace",
"REPLS": {
"TYPE" : "f16",
"OP": "*"
},
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "sub_f32",
"REPLS": {
"TYPE" : "f32",
"OP": "-"
},
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "sub_f16",
"REPLS": {
"TYPE" : "f16",
"OP": "-"
},
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "sub_f32_inplace",
"REPLS": {
"TYPE" : "f32",
"OP": "-"
},
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "sub_f16_inplace",
"REPLS": {
"TYPE" : "f16",
"OP": "-"
},
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "div_f32",
"REPLS": {
"TYPE" : "f32",
"OP": "/"
},
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "div_f16",
"REPLS": {
"TYPE" : "f16",
"OP": "/"
},
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "div_f32_inplace",
"REPLS": {
"TYPE" : "f32",
"OP": "/"
},
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "div_f16_inplace",
"REPLS": {
"TYPE" : "f16",
"OP": "/"
},
"DECLS": ["INPLACE"]
}
]
#end(VARIANTS)
#define(DECLS)
#decl(NOT_INPLACE)
fn update(dst_i: u32, src0_i: u32, src1_i: u32) {
dst[dst_i] = src0[src0_i] {{OP}} src1[src1_i];
}
@group(0) @binding(2)
var<storage, read_write> dst: array<{{TYPE}}>;
@group(0) @binding(3)
var<uniform> params: Params;
#enddecl(NOT_INPLACE)
#decl(INPLACE)
fn update(dst_i: u32, src0_i: u32, src1_i: u32) {
src0[dst_i] = src0[src0_i] {{OP}} src1[src1_i];
}
@group(0) @binding(2)
var<uniform> params: Params;
#enddecl(INPLACE)
#end(DECLS)
#define(SHADER)
enable f16;
#include "binary_head.tmpl"
@group(0) @binding(0)
var<storage, read_write> src0: array<{{TYPE}}>;
@group(0) @binding(1)
var<storage, read_write> src1: array<{{TYPE}}>;
DECLS
override wg_size: u32;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x < params.ne) {
update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x));
}
}
#end(SHADER)
@@ -0,0 +1,101 @@
#define(VARIANTS)
[
{
"REPLS": {
"SRC_TYPE": "f32",
"DST_TYPE": "f32"
}
},
{
"REPLS": {
"SRC_TYPE": "f32",
"DST_TYPE": "f16"
}
},
{
"REPLS": {
"SRC_TYPE": "f16",
"DST_TYPE": "f16"
}
},
{
"REPLS": {
"SRC_TYPE": "f16",
"DST_TYPE": "f32"
}
}
]
#end(VARIANTS)
#define(SHADER)
enable f16;
@group(0) @binding(0)
var<storage, read_write> src: array<{{SRC_TYPE}}>;
@group(0) @binding(1)
var<storage, read_write> dst: array<{{DST_TYPE}}>;
struct Params {
ne: u32, // total number of elements
offset_src: u32, // in elements
offset_dst: u32, // in elements
// Strides (in elements) — may be permuted
stride_src0: u32,
stride_src1: u32,
stride_src2: u32,
stride_src3: u32,
stride_dst0: u32,
stride_dst1: u32,
stride_dst2: u32,
stride_dst3: u32,
// Logical shapes
src_ne0: u32,
src_ne1: u32,
src_ne2: u32,
dst_ne0: u32,
dst_ne1: u32,
dst_ne2: u32
};
@group(0) @binding(2)
var<uniform> params: Params;
override wg_size: u32;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x >= params.ne) {
return;
}
var i = gid.x;
let i3 = i / (params.src_ne2 * params.src_ne1 * params.src_ne0);
i = i % (params.src_ne2 * params.src_ne1 * params.src_ne0);
let i2 = i / (params.src_ne1 * params.src_ne0);
i = i % (params.src_ne1 * params.src_ne0);
let i1 = i / params.src_ne0;
let i0 = i % params.src_ne0;
var j = gid.x;
let j3 = j / (params.dst_ne2 * params.dst_ne1 * params.dst_ne0);
j = j % (params.dst_ne2 * params.dst_ne1 * params.dst_ne0);
let j2 = j / (params.dst_ne1 * params.dst_ne0);
j = j % (params.dst_ne1 * params.dst_ne0);
let j1 = j / params.dst_ne0;
let j0 = j % params.dst_ne0;
let src_idx = i0 * params.stride_src0 + i1 * params.stride_src1 +
i2 * params.stride_src2 + i3 * params.stride_src3;
let dst_idx = j0 * params.stride_dst0 + j1 * params.stride_dst1 +
j2 * params.stride_dst2 + j3 * params.stride_dst3;
dst[params.offset_dst + dst_idx] = {{DST_TYPE}}((src[params.offset_src + src_idx]));
}
#end(SHADER)
@@ -1,60 +0,0 @@
enable f16;
@group(0) @binding(0)
var<storage, read_write> src: array<f32>;
@group(0) @binding(1)
var<storage, read_write> dst: array<f16>;
struct Params {
ne: u32, // total number of elements
offset_src: u32, // in elements
offset_dst: u32, // in elements
// Strides (in elements) — may be permuted
stride_src0: u32,
stride_src1: u32,
stride_src2: u32,
stride_src3: u32,
stride_dst0: u32,
stride_dst1: u32,
stride_dst2: u32,
stride_dst3: u32,
// Logical shape (same for both tensors)
ne0: u32,
ne1: u32,
ne2: u32,
ne3: u32,
};
@group(0) @binding(2)
var<uniform> params: Params;
override wg_size: u32;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x >= params.ne) {
return;
}
var i = gid.x;
let i3 = i / (params.ne2 * params.ne1 * params.ne0);
i = i % (params.ne2 * params.ne1 * params.ne0);
let i2 = i / (params.ne1 * params.ne0);
i = i % (params.ne1 * params.ne0);
let i1 = i / params.ne0;
let i0 = i % params.ne0;
let src_idx = i0 * params.stride_src0 + i1 * params.stride_src1 +
i2 * params.stride_src2 + i3 * params.stride_src3;
let dst_idx = i0 * params.stride_dst0 + i1 * params.stride_dst1 +
i2 * params.stride_dst2 + i3 * params.stride_dst3;
dst[params.offset_dst + dst_idx] = f16(src[params.offset_src + src_idx]);
}
@@ -88,15 +88,20 @@ def generate_variants(fname, input_dir, output_dir, outfile):
raise ValueError(f"DECLS key '{key}' not found.")
decls_code += decls_map[key] + "\n\n"
shader_variant = replace_placeholders(shader_template, variant["REPLS"])
final_shader = re.sub(r'\bDECLS\b', decls_code, shader_variant)
final_shader = re.sub(r'\bDECLS\b', decls_code, shader_template)
if "REPLS" in variant:
final_shader = replace_placeholders(final_shader, variant["REPLS"])
final_shader = expand_includes(final_shader, input_dir)
if "SRC0_TYPE" in variant["REPLS"] and "SRC1_TYPE" in variant["REPLS"]:
if "SHADER_NAME" in variant:
output_name = variant["SHADER_NAME"]
elif "SHADER_SUFFIX" in variant:
output_name = f"{shader_base_name}_" + variant["SHADER_SUFFIX"]
elif "REPLS" in variant and "SRC0_TYPE" in variant["REPLS"] and "SRC1_TYPE" in variant["REPLS"]:
output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC0_TYPE"], variant["REPLS"]["SRC1_TYPE"]])
elif "TYPE_SUFFIX" in variant["REPLS"]:
output_name = f"{shader_base_name}_" + variant["REPLS"]["TYPE_SUFFIX"]
elif "TYPE" in variant["REPLS"]:
elif "REPLS" in variant and "SRC_TYPE" in variant["REPLS"] and "DST_TYPE" in variant["REPLS"]:
output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC_TYPE"], variant["REPLS"]["DST_TYPE"]])
elif "REPLS" in variant and "TYPE" in variant["REPLS"]:
output_name = f"{shader_base_name}_" + variant["REPLS"]["TYPE"]
else:
output_name = shader_base_name
@@ -2,9 +2,9 @@
[
{
"SHADER_SUFFIX": "f32_vec",
"REPLS": {
"TYPE" : "vec4<f32>",
"TYPE_SUFFIX": "f32_vec",
"DST_TYPE": "vec4<f32>",
"BLOCK_SIZE": 4
},
@@ -0,0 +1,323 @@
#define(VARIANTS)
[
{
"SHADER_NAME": "reglu_f32",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["NO_SPLIT", "REGLU"]
},
{
"SHADER_NAME": "reglu_f32_split",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["SPLIT", "REGLU"]
},
{
"SHADER_NAME": "reglu_f16",
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["NO_SPLIT", "REGLU"]
},
{
"SHADER_NAME": "reglu_f16_split",
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["SPLIT", "REGLU"]
},
{
"SHADER_NAME": "geglu_f32",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["NO_SPLIT", "GEGLU"]
},
{
"SHADER_NAME": "geglu_f32_split",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["SPLIT", "GEGLU"]
},
{
"SHADER_NAME": "geglu_f16",
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["NO_SPLIT", "GEGLU"]
},
{
"SHADER_NAME": "geglu_f16_split",
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["SPLIT", "GEGLU"]
},
{
"SHADER_NAME": "swiglu_f32",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["NO_SPLIT", "SWIGLU"]
},
{
"SHADER_NAME": "swiglu_f32_split",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["SPLIT", "SWIGLU"]
},
{
"SHADER_NAME": "swiglu_f16",
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["NO_SPLIT", "SWIGLU"]
},
{
"SHADER_NAME": "swiglu_f16_split",
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["SPLIT", "SWIGLU"]
},
{
"SHADER_NAME": "swiglu_oai_f32",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["NO_SPLIT", "SWIGLU_OAI"]
},
{
"SHADER_NAME": "swiglu_oai_f32_split",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["SPLIT", "SWIGLU_OAI"]
},
{
"SHADER_NAME": "geglu_erf_f32",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["NO_SPLIT", "GEGLU_ERF"]
},
{
"SHADER_NAME": "geglu_erf_f32_split",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["SPLIT", "GEGLU_ERF"]
},
{
"SHADER_NAME": "geglu_erf_f16",
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["NO_SPLIT", "GEGLU_ERF"]
},
{
"SHADER_NAME": "geglu_erf_f16_split",
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["SPLIT", "GEGLU_ERF"]
},
{
"SHADER_NAME": "geglu_quick_f32",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["NO_SPLIT", "GEGLU_QUICK"]
},
{
"SHADER_NAME": "geglu_quick_f32_split",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["SPLIT", "GEGLU_QUICK"]
},
{
"SHADER_NAME": "geglu_quick_f16",
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["NO_SPLIT", "GEGLU_QUICK"]
},
{
"SHADER_NAME": "geglu_quick_f16_split",
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["SPLIT", "GEGLU_QUICK"]
},
]
#end(VARIANTS)
#define(DECLS)
#decl(REGLU)
fn op(a: {{TYPE}}, b: {{TYPE}}) -> {{TYPE}} {
return max(a, 0) * b;
}
#enddecl(REGLU)
#decl(GEGLU)
const SQRT_2_OVER_PI: {{TYPE}} = 0.79788456080286535587989211986876;
const GELU_COEF_A: {{TYPE}} = 0.044715;
fn op(a: {{TYPE}}, b: {{TYPE}}) -> {{TYPE}} {
let val = SQRT_2_OVER_PI * a * (1.0 + GELU_COEF_A * a * a);
return 0.5 * a * (2.0 - 2.0 / (exp(2 * val) + 1)) * b;
}
#enddecl(GEGLU)
#decl(SWIGLU)
fn op(a: {{TYPE}}, b: {{TYPE}}) -> {{TYPE}} {
return a / (1.0 + exp(-a)) * b;
}
#enddecl(SWIGLU)
#decl(SWIGLU_OAI)
fn op(a: f32, b: f32) -> f32 {
let xi = min(a, params.limit);
let gi = max(min(b, params.limit), -params.limit);
var out_glu = xi / (1.0 + exp(-xi * params.alpha));
out_glu = out_glu * (1.0 + gi);
return out_glu;
}
#enddecl(SWIGLU_OAI)
#decl(GEGLU_ERF)
const p_erf: {{TYPE}} = 0.3275911;
const a1_erf: {{TYPE}} = 0.254829592;
const a2_erf: {{TYPE}} = -0.284496736;
const a3_erf: {{TYPE}} = 1.421413741;
const a4_erf: {{TYPE}} = -1.453152027;
const a5_erf: {{TYPE}} = 1.061405429;
const SQRT_2_INV: {{TYPE}} = 0.7071067811865476;
fn op(a: {{TYPE}}, b: {{TYPE}}) -> {{TYPE}} {
let a_div_sqr2 = a * SQRT_2_INV;
let sign_x = sign(a_div_sqr2);
let x = abs(a_div_sqr2);
let t = 1.0 / (1.0 + p_erf * x);
let y = 1.0 - (((((a5_erf * t + a4_erf) * t + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x));
let erf_approx = sign_x * y;
return 0.5 * a * (1.0 + erf_approx) * b;
}
#enddecl(GEGLU_ERF)
#decl(GEGLU_QUICK)
const GELU_QUICK_COEF: {{TYPE}} = -1.702;
fn op(a: {{TYPE}}, b: {{TYPE}}) -> {{TYPE}} {
return a * (1.0 / (1.0 + exp(GELU_QUICK_COEF * a))) * b;
}
#enddecl(GEGLU_QUICK)
#decl(NO_SPLIT)
@group(0) @binding(1)
var<storage, read_write> dst: array<{{TYPE}}>;
@group(0) @binding(2)
var<uniform> params: Params;
fn a_value(base: u32) -> {{TYPE}} {
let offset: u32 = select(0, params.ne0, params.swapped != 0);
return src0[base + offset];
}
fn b_value(base: u32) -> {{TYPE}} {
let offset: u32 = select(params.ne0, 0, params.swapped != 0);
return src0[base + offset];
}
#enddecl(NO_SPLIT)
#decl(SPLIT)
@group(0) @binding(1)
var<storage, read_write> src1: array<{{TYPE}}>;
@group(0) @binding(2)
var<storage, read_write> dst: array<{{TYPE}}>;
@group(0) @binding(3)
var<uniform> params: Params;
fn a_value(base: u32) -> {{TYPE}} {
return src0[base];
}
fn b_value(base: u32) -> {{TYPE}} {
return src1[base];
}
#enddecl(SPLIT)
#end(DECLS)
#define(SHADER)
enable f16;
struct Params {
offset_src0: u32,
offset_src1: u32,
offset_dst: u32,
// Strides (in elements)
stride_src01: u32,
stride_src02: u32,
stride_src03: u32,
stride_src11: u32,
stride_src12: u32,
stride_src13: u32,
stride_dst1: u32,
stride_dst2: u32,
stride_dst3: u32,
// shape of dst
ne: u32,
ne0: u32,
ne1: u32,
ne2: u32,
swapped: u32,
alpha: f32,
limit: f32,
}
@group(0) @binding(0)
var<storage, read_write> src0: array<{{TYPE}}>;
DECLS
override wg_size: u32;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x >= params.ne) {
return;
}
var i = gid.x;
let i3 = i / (params.ne2 * params.ne1 * params.ne0);
i = i % (params.ne2 * params.ne1 * params.ne0);
let i2 = i / (params.ne1 * params.ne0);
i = i % (params.ne1 * params.ne0);
let i1 = i / params.ne0;
let i0 = i % params.ne0;
let i_a = params.offset_src0 + i3 * params.stride_src03 + i2 * params.stride_src02 + i1 * params.stride_src01 + i0;
let i_b = params.offset_src1 + i3 * params.stride_src13 + i2 * params.stride_src12 + i1 * params.stride_src11 + i0;
let i_dst = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1 + i0;
dst[i_dst] = op(a_value(i_a), b_value(i_b));
}
#end(SHADER)
@@ -1,44 +0,0 @@
#define(VARIANTS)
[
{
"REPLS": {
"TYPE" : "f32",
}
},
{
"REPLS": {
"TYPE" : "f16",
}
}
]
#end(VARIANTS)
#define(SHADER)
enable f16;
#include "binary_head.tmpl"
@group(0) @binding(0)
var<storage, read_write> src0: array<{{TYPE}}>;
@group(0) @binding(1)
var<storage, read_write> src1: array<{{TYPE}}>;
@group(0) @binding(2)
var<storage, read_write> dst: array<{{TYPE}}>;
@group(0) @binding(3)
var<uniform> params: Params;
override wg_size: u32;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x < params.ne) {
dst[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] * src1[params.offset_src1 + src1_index(gid.x)];
}
}
#end(SHADER)
@@ -1,41 +0,0 @@
#define(VARIANTS)
[
{
"REPLS": {
"TYPE" : "f32",
}
},
{
"REPLS": {
"TYPE" : "f16",
}
}
]
#end(VARIANTS)
#define(SHADER)
enable f16;
#include "binary_head.tmpl"
@group(0) @binding(0)
var<storage, read_write> src0: array<{{TYPE}}>;
@group(0) @binding(1)
var<storage, read_write> src1: array<{{TYPE}}>;
@group(0) @binding(2)
var<uniform> params: Params;
override wg_size: u32;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x < params.ne) {
src0[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] * src1[params.offset_src1 + src1_index(gid.x)];
}
}
#end(SHADER)
@@ -1,9 +1,48 @@
@group(0) @binding(0)
var<storage, read_write> src: array<f32>;
#define(VARIANTS)
[
{
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_SUFFIX": "inplace",
"DECLS": ["INPLACE"]
},
]
#end(VARIANTS)
#define(DECLS)
#decl(NOT_INPLACE)
fn update(src_offset: u32, dst_offset: u32, scale: f32) {
dst[dst_offset] = scale * src[src_offset];
}
@group(0) @binding(1)
var<storage, read_write> dst: array<f32>;
@group(0) @binding(2)
var<uniform> params: Params;
#enddecl(NOT_INPLACE)
#decl(INPLACE)
fn update(src_offset: u32, dst_offset: u32, scale: f32) {
src[dst_offset] = scale * src[src_offset];
}
@group(0) @binding(1)
var<uniform> params: Params;
#enddecl(INPLACE)
#end(DECLS)
#define(SHADER)
struct Params {
offset_src: u32, // in elements
offset_dst: u32, // in elements
@@ -23,11 +62,13 @@ struct Params {
ne2: u32,
ne3: u32,
eps: u32
eps: f32
};
@group(0) @binding(2)
var<uniform> params: Params;
@group(0) @binding(0)
var<storage, read_write> src: array<f32>;
DECLS
override wg_size: u32;
@compute @workgroup_size(wg_size)
@@ -49,9 +90,9 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
for (var j: u32 = 0; j < params.ne0; j++) {
sum += src[i_src_row + j] * src[i_src_row + j];
}
let eps = bitcast<f32>(params.eps);
let scale = 1.0/sqrt(sum/f32(params.ne0) + eps);
let scale = 1.0/sqrt(sum/f32(params.ne0) + params.eps);
for (var j: u32 = 0; j < params.ne0; j++) {
dst[i_dst_row + j] = scale * src[i_src_row + j];
update(i_src_row + j, i_dst_row + j, scale);
}
}
#end(SHADER)
@@ -1,48 +0,0 @@
@group(0) @binding(0)
var<storage, read_write> a: array<f32>;
struct Params {
offset: u32, // in elements
// Strides (in elements)
stride1: u32,
stride2: u32,
stride3: u32,
// Shape
ne0: u32,
ne1: u32,
ne2: u32,
ne3: u32,
eps: u32
};
@group(0) @binding(1)
var<uniform> params: Params;
override wg_size: u32;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x >= params.ne1 * params.ne2 * params.ne3) {
return;
}
// one thread per row
var i = gid.x;
let i3 = i / (params.ne2 * params.ne1);
i = i % (params.ne2 * params.ne1);
let i2 = i / params.ne1;
let i1 = i % params.ne1;
let i_row = params.offset + i3 * params.stride3 + i2 * params.stride2 + i1 * params.stride1;
var sum = 0.0f;
for (var j: u32 = 0; j < params.ne0; j++) {
sum += a[i_row + j] * a[i_row + j];
}
let eps = bitcast<f32>(params.eps);
let scale = 1.0/sqrt(sum/f32(params.ne0) + eps);
for (var j: u32 = 0; j < params.ne0; j++) {
a[i_row + j] = scale * a[i_row + j];
}
}
@@ -0,0 +1,282 @@
#define(VARIANTS)
[
{
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["NO_FF_BINDINGS", "NO_FF_FUNC", "ROTATE"]
},
{
"SHADER_SUFFIX": "f32_inplace",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["NO_FF_BINDINGS_INPLACE", "NO_FF_FUNC", "ROTATE_INPLACE"]
},
{
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["NO_FF_BINDINGS", "NO_FF_FUNC", "ROTATE"]
},
{
"SHADER_SUFFIX": "f16_inplace",
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["NO_FF_BINDINGS_INPLACE", "NO_FF_FUNC", "ROTATE_INPLACE"]
},
{
"SHADER_SUFFIX": "f32_ff",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["FF_BINDINGS", "FF_FUNC", "ROTATE"]
},
{
"SHADER_SUFFIX": "f32_ff_inplace",
"REPLS": {
"TYPE" : "f32",
},
"DECLS": ["FF_BINDINGS_INPLACE", "FF_FUNC", "ROTATE_INPLACE"]
},
{
"SHADER_SUFFIX": "f16_ff",
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["FF_BINDINGS", "FF_FUNC", "ROTATE"]
},
{
"SHADER_SUFFIX": "f16_ff_inplace",
"REPLS": {
"TYPE" : "f16",
},
"DECLS": ["FF_BINDINGS_INPLACE", "FF_FUNC", "ROTATE_INPLACE"]
}
]
#end(VARIANTS)
#define(DECLS)
#decl(ROTATE)
fn rotate(i_dst0: u32, i_dst1: u32, out0: f32, out1: f32) {
dst[i_dst0] = {{TYPE}}(out0);
dst[i_dst1] = {{TYPE}}(out1);
}
#enddecl(ROTATE)
#decl(ROTATE_INPLACE)
fn rotate(i_dst0: u32, i_dst1: u32, out0: f32, out1: f32) {
src0[i_dst0] = {{TYPE}}(out0);
src0[i_dst1] = {{TYPE}}(out1);
}
#enddecl(ROTATE_INPLACE)
#decl(NO_FF_FUNC)
fn freq_factor(i: u32) -> f32 {
return 1.0f;
}
#enddecl(NO_FF_FUNC)
#decl(FF_FUNC)
fn freq_factor(i: u32) -> f32 {
return src2[params.offset_src2 + i/2];
}
#enddecl(FF_FUNC)
#decl(NO_FF_BINDINGS)
@group(0) @binding(2)
var<storage, read_write> dst: array<{{TYPE}}>;
@group(0) @binding(3)
var<uniform> params: Params;
#enddecl(NO_FF_BINDINGS)
#decl(NO_FF_BINDINGS_INPLACE)
@group(0) @binding(2)
var<uniform> params: Params;
#enddecl(NO_FF_BINDINGS_INPLACE)
#decl(FF_BINDINGS)
@group(0) @binding(2)
var<storage, read_write> src2: array<f32>;
@group(0) @binding(3)
var<storage, read_write> dst: array<{{TYPE}}>;
@group(0) @binding(4)
var<uniform> params: Params;
#enddecl(FF_BINDINGS)
#decl(FF_BINDINGS_INPLACE)
@group(0) @binding(2)
var<storage, read_write> src2: array<f32>;
@group(0) @binding(3)
var<uniform> params: Params;
#enddecl(FF_BINDINGS_INPLACE)
#end(DECLS)
#define(SHADER)
enable f16;
struct Params {
offset_src0: u32,
offset_src1: u32,
offset_src2: u32,
offset_dst: u32,
// Strides (in elements)
stride_src01: u32,
stride_src02: u32,
stride_src03: u32,
stride_dst1: u32,
stride_dst2: u32,
stride_dst3: u32,
n_threads: u32,
ne0: u32,
ne1: u32,
ne2: u32,
n_dims: u32,
mode: u32,
theta_scale: f32,
attn_factor: f32,
freq_scale: f32,
ext_factor: f32,
corr_dim0: f32,
corr_dim1: f32,
sections0: u32,
sections1: u32,
sections2: u32,
sections3: u32
};
@group(0) @binding(0)
var<storage, read_write> src0: array<{{TYPE}}>;
@group(0) @binding(1)
var<storage, read_write> src1: array<i32>;
DECLS
fn rope_yarn_ramp(low: f32, high: f32, i: u32) -> f32 {
let y = (f32(i / 2) - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y));
}
// returns vector of (cos_theta, sin_theta)
// TODO: check performance of instantiating once on the CPU and passed as buffer, since it's repeated per-row
fn rope_yarn(theta_extrap: f32, i: u32) -> vec2<f32> {
var mscale = params.attn_factor;
var theta = params.freq_scale * theta_extrap;
if (params.ext_factor != 0.0f) {
let ramp_mix = rope_yarn_ramp(params.corr_dim0, params.corr_dim1, i) * params.ext_factor;
theta = theta * (1 - ramp_mix) + theta_extrap * ramp_mix;
mscale *= 1.0f + 0.1f * log(1.0f / params.freq_scale);
}
return vec2<f32>(cos(theta) * mscale, sin(theta) * mscale);
}
fn pair_base(i0: u32, div_2: bool) -> u32 {
if (div_2) {
return i0 / 2;
} else {
return i0;
}
}
fn pair_offset(is_neox: bool, is_mrope: bool, is_vision: bool) -> u32 {
if (is_vision) {
return params.n_dims;
} else if (is_neox || is_mrope) {
return params.n_dims / 2;
} else {
return 1;
}
}
override wg_size: u32;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
// two elements per thread
if (gid.x >= params.n_threads) {
return;
}
let is_neox = bool(params.mode & 2);
let is_mrope = bool(params.mode & 8);
let is_vision = params.mode == 24;
var i = gid.x * 2; // start index for this thread
let i3 = i / (params.ne2 * params.ne1 * params.ne0);
i = i % (params.ne2 * params.ne1 * params.ne0);
let i2 = i / (params.ne1 * params.ne0);
i = i % (params.ne1 * params.ne0);
let i1 = i / params.ne0;
let i0 = i % params.ne0;
let i_src_row = params.offset_src0 + i3 * params.stride_src03 + i2 * params.stride_src02 + i1 * params.stride_src01;
let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1;
if (i0 >= params.n_dims && !is_vision) {
let i_src = i_src_row + i0;
let i_dst = i_dst_row + i0;
rotate(i_dst, i_dst + 1, f32(src0[i_src]), f32(src0[i_src + 1]));
return;
}
var theta_base_mult: u32 = 0;
var theta_scale_pwr: u32 = i0 / 2;
if (is_mrope) {
let sect_dims = params.sections0 + params.sections1 + params.sections2 + params.sections3;
let sec_w = params.sections1 + params.sections0;
let sec_e = params.sections2 + sec_w;
let sector = (i0 / 2) % sect_dims;
if (sector >= params.sections0 && sector < sec_w) {
theta_base_mult = 1;
if (is_vision) {
theta_scale_pwr = sector - params.sections0;
}
} else if (sector >= sec_w && sector < sec_e) {
theta_base_mult = 2;
if (is_vision) {
theta_scale_pwr = sector - sec_w;
}
} else if (sector >= sec_e) {
if (is_vision) {
theta_scale_pwr = sector - sec_e;
theta_scale_pwr = (i0 / 2) % sec_e;
}
theta_base_mult = 3;
} else if (is_vision) {
theta_scale_pwr = sector;
}
}
let theta_base = f32(src1[params.offset_src1 + i2 + params.ne2 * theta_base_mult]) * pow(params.theta_scale, f32(theta_scale_pwr));
let thetas = rope_yarn(theta_base/freq_factor(i0), i0);
let i_src = i_src_row + pair_base(i0, is_neox || is_mrope || is_vision);
let i_dst = i_dst_row + pair_base(i0, is_neox || is_mrope || is_vision);
let x0 = f32(src0[i_src]);
let x1 = f32(src0[i_src + pair_offset(is_neox, is_mrope, is_vision)]);
rotate(i_dst, i_dst + pair_offset(is_neox, is_mrope, is_vision), x0 * thetas.x - x1 * thetas.y, x0 * thetas.y + x1 * thetas.x);
}
#end(SHADER)
@@ -0,0 +1,90 @@
#define(VARIANTS)
[
{
"SHADER_NAME": "scale_f32",
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "scale_f32_inplace",
"DECLS": ["INPLACE"]
}
]
#end(VARIANTS)
#define(DECLS)
#decl(NOT_INPLACE)
@group(0) @binding(1)
var<storage, read_write> dst: array<f32>;
@group(0) @binding(2)
var<uniform> params: Params;
fn store_scale(val: f32, offset: u32) {
dst[offset] = val;
}
#enddecl(NOT_INPLACE)
#decl(INPLACE)
@group(0) @binding(1)
var<uniform> params: Params;
fn store_scale(val: f32, offset: u32) {
src[offset] = val;
}
#enddecl(INPLACE)
#end(DECLS)
#define(SHADER)
struct Params {
offset_src: u32,
offset_dst: u32,
// Strides (in elements)
stride_src1: u32,
stride_src2: u32,
stride_src3: u32,
stride_dst1: u32,
stride_dst2: u32,
stride_dst3: u32,
ne: u32,
ne0: u32,
ne1: u32,
ne2: u32,
scale: f32,
bias: f32
};
@group(0) @binding(0)
var<storage, read_write> src: array<f32>;
DECLS
override wg_size: u32;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x >= params.ne) {
return;
}
var i = gid.x;
let i3 = i / (params.ne2 * params.ne1 * params.ne0);
i = i % (params.ne2 * params.ne1 * params.ne0);
let i2 = i / (params.ne1 * params.ne0);
i = i % (params.ne1 * params.ne0);
let i1 = i / params.ne0;
let i0 = i % params.ne0;
let i_src = params.offset_src + i3 * params.stride_src3 + i2 * params.stride_src2 + i1 * params.stride_src1 + i0;
let i_dst = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1 + i0;
store_scale(src[i_src] * params.scale + params.bias, i_dst);
}
#end(SHADER)
+5 -3
View File
@@ -4825,11 +4825,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
// Optional tensors
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
}
}
}
+73 -30
View File
@@ -2733,23 +2733,30 @@ struct test_scale : public test_case {
const std::array<int64_t, 4> ne;
float scale;
float bias;
bool inplace;
std::string vars() override {
return VARS_TO_STR4(type, ne, scale, bias);
return VARS_TO_STR5(type, ne, scale, bias, inplace);
}
test_scale(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 10, 10, 10},
float scale = 2.0f,
float bias = 0.0f)
: type(type), ne(ne), scale(scale), bias(bias) {}
float bias = 0.0f,
bool inplace = false)
: type(type), ne(ne), scale(scale), bias(bias), inplace(inplace) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * out = ggml_scale_bias(ctx, a, scale, bias);
ggml_tensor * out;
if (inplace) {
out = ggml_scale_bias_inplace(ctx, a, scale, bias);
} else {
out = ggml_scale_bias(ctx, a, scale, bias);
}
ggml_set_name(out, "out");
return out;
@@ -2906,16 +2913,18 @@ struct test_rms_norm : public test_case {
const std::array<int64_t, 4> ne;
const bool v; // whether a is a non-contiguous view
const float eps;
const bool inplace; // whether to do the operation inplace
std::string vars() override {
return VARS_TO_STR4(type, ne, v, eps);
return VARS_TO_STR5(type, ne, v, eps, inplace);
}
test_rms_norm(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {64, 5, 4, 3},
bool v = false,
float eps = 1e-6f)
: type(type), ne(ne), v(v), eps(eps) {}
float eps = 1e-6f,
bool inplace = false)
: type(type), ne(ne), v(v), eps(eps), inplace(inplace) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
@@ -2927,7 +2936,12 @@ struct test_rms_norm : public test_case {
ggml_set_name(a, "view of a");
}
ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
ggml_tensor * out;
if (inplace) {
out = ggml_rms_norm_inplace(ctx, a, eps);
} else {
out = ggml_rms_norm(ctx, a, eps);
}
ggml_set_name(out, "out");
return out;
@@ -3832,17 +3846,18 @@ struct test_rope : public test_case {
bool ff;
int v; // view (1 : non-contiguous a)
bool forward;
bool inplace;
std::string vars() override {
// forward can be inferred from the op, does not need to be printed
return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
return VARS_TO_STR11(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v, inplace);
}
test_rope(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_a = {10, 5, 3, 1},
int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f,
float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0, bool forward = true)
: type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v), forward(forward) {}
int n_dims = 10, int mode = GGML_ROPE_TYPE_NORMAL, int n_ctx = 512, float fs = 1.0f,
float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0, bool forward = true, bool inplace = false)
: type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v), forward(forward), inplace(inplace) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a;
@@ -3887,7 +3902,11 @@ struct test_rope : public test_case {
GGML_ASSERT(n_dims/4 > 0);
int rope_sections[4] = {n_dims/4, n_dims/4, 0, 0}; // Vision-RoPE only use first two dimension for image (x, y) coordinate
if (forward) {
out = ggml_rope_multi (ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
if (inplace) {
out = ggml_rope_multi_inplace(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
} else {
out = ggml_rope_multi(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
}
} else {
out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
}
@@ -3895,14 +3914,22 @@ struct test_rope : public test_case {
GGML_ASSERT(n_dims/3 > 0);
int rope_sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0};
if (forward) {
out = ggml_rope_multi (ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
if (inplace) {
out = ggml_rope_multi_inplace(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
} else {
out = ggml_rope_multi(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
}
} else {
out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
}
}
} else {
if (forward) {
out = ggml_rope_ext (ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
if (inplace) {
out = ggml_rope_ext_inplace(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
} else {
out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
}
} else {
out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
}
@@ -6183,9 +6210,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
//add_test_bin_bcast(type, {3, 3, 2560, 1280}, {2, 1, 1, 1});
}
// single in-place tests, especially important for WebGPU backend since kernels for in-place vs. not are different
// single inplace tests, especially important for WebGPU backend since kernels for inplace vs. not are different
test_cases.emplace_back(new test_bin_bcast(ggml_add_inplace, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 1}, 16));
test_cases.emplace_back(new test_bin_bcast(ggml_mul_inplace, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 1}, 16));
test_cases.emplace_back(new test_bin_bcast(ggml_sub_inplace, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 1}, 16));
test_cases.emplace_back(new test_bin_bcast(ggml_div_inplace, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 1}, 16));
// fusion
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1}, 2));
@@ -6200,6 +6229,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_add1());
test_cases.emplace_back(new test_scale());
test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f));
test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f, true)); // inplace test
test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {100, 10, 10, 10}, 2.0f, 1.0f));
test_cases.emplace_back(new test_softcap(GGML_TYPE_F32, {10, 10, 10, 10}, 50.0f));
test_cases.emplace_back(new test_silu_back());
@@ -6212,6 +6242,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
test_cases.emplace_back(new test_l2_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps));
}
// in-place tests
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, false, 1e-6f, true));
for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f}) {
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
@@ -6559,26 +6593,26 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
for (bool ff : {false, true}) { // freq_factors
for (float v : { 0, 1 }) {
test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 7B
test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw)); // llama 7B
if (all) {
test_cases.emplace_back(new test_rope(type, {128, 40, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 13B
test_cases.emplace_back(new test_rope(type, {128, 52, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 30B
test_cases.emplace_back(new test_rope(type, {128, 64, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 65B
test_cases.emplace_back(new test_rope(type, {128, 40, 2, 1}, 128, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw)); // llama 13B
test_cases.emplace_back(new test_rope(type, {128, 52, 2, 1}, 128, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw)); // llama 30B
test_cases.emplace_back(new test_rope(type, {128, 64, 2, 1}, 128, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw)); // llama 65B
}
if (all) {
test_cases.emplace_back(new test_rope(type, { 64, 1, 2, 1}, 64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B)
test_cases.emplace_back(new test_rope(type, { 64, 71, 2, 1}, 64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B)
test_cases.emplace_back(new test_rope(type, { 64, 8, 2, 1}, 64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B)
test_cases.emplace_back(new test_rope(type, { 64, 1, 2, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B)
test_cases.emplace_back(new test_rope(type, { 64, 71, 2, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B)
test_cases.emplace_back(new test_rope(type, { 64, 8, 2, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B)
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, 0, 512, fs, ef, af, ff, v, fw));
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 0, 512, fs, ef, af, ff, v, fw));
test_cases.emplace_back(new test_rope(type, { 80, 32, 4, 1}, 32, 0, 512, fs, ef, af, ff, v, fw));
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw));
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw));
test_cases.emplace_back(new test_rope(type, { 80, 32, 4, 1}, 32, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw));
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, 2, 512, fs, ef, af, ff, v, fw)); // neox (stablelm)
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 2, 512, fs, ef, af, ff, v, fw)); // neox (phi-2)
test_cases.emplace_back(new test_rope(type, { 80, 32, 4, 1}, 32, 2, 512, fs, ef, af, ff, v, fw)); // neox (phi-2)
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (stablelm)
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (phi-2)
test_cases.emplace_back(new test_rope(type, { 80, 32, 4, 1}, 32, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (phi-2)
}
if (all) {
@@ -6589,7 +6623,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_rope(type, { 80, 16, 2, 1}, 80, GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT)
}
test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B)
test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B)
}
}
@@ -6600,6 +6634,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
}
}
// single inplace test per type/mode/ff
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
for (int mode : {GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_VISION}) {
for (bool ff : {false, true}) {
test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, mode, 512, 1.4245f, 0.7465f, 1.4245f, ff, 0, true, true));
}
}
}
for (int v : { 0, 1, 2, 3 }) {
for (int dim : { 0, 1, 2, 3, }) {
test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
Binary file not shown.
@@ -221,69 +221,66 @@ class ChatStore {
*/
private getApiOptions(): Record<string, unknown> {
const currentConfig = config();
const hasValue = (value: unknown): boolean =>
value !== undefined && value !== null && value !== '';
const apiOptions: Record<string, unknown> = {
stream: true,
timings_per_token: true
};
if (currentConfig.temperature !== undefined && currentConfig.temperature !== null) {
if (hasValue(currentConfig.temperature)) {
apiOptions.temperature = Number(currentConfig.temperature);
}
if (currentConfig.max_tokens !== undefined && currentConfig.max_tokens !== null) {
if (hasValue(currentConfig.max_tokens)) {
apiOptions.max_tokens = Number(currentConfig.max_tokens);
}
if (currentConfig.dynatemp_range !== undefined && currentConfig.dynatemp_range !== null) {
if (hasValue(currentConfig.dynatemp_range)) {
apiOptions.dynatemp_range = Number(currentConfig.dynatemp_range);
}
if (currentConfig.dynatemp_exponent !== undefined && currentConfig.dynatemp_exponent !== null) {
if (hasValue(currentConfig.dynatemp_exponent)) {
apiOptions.dynatemp_exponent = Number(currentConfig.dynatemp_exponent);
}
if (currentConfig.top_k !== undefined && currentConfig.top_k !== null) {
if (hasValue(currentConfig.top_k)) {
apiOptions.top_k = Number(currentConfig.top_k);
}
if (currentConfig.top_p !== undefined && currentConfig.top_p !== null) {
if (hasValue(currentConfig.top_p)) {
apiOptions.top_p = Number(currentConfig.top_p);
}
if (currentConfig.min_p !== undefined && currentConfig.min_p !== null) {
if (hasValue(currentConfig.min_p)) {
apiOptions.min_p = Number(currentConfig.min_p);
}
if (currentConfig.xtc_probability !== undefined && currentConfig.xtc_probability !== null) {
if (hasValue(currentConfig.xtc_probability)) {
apiOptions.xtc_probability = Number(currentConfig.xtc_probability);
}
if (currentConfig.xtc_threshold !== undefined && currentConfig.xtc_threshold !== null) {
if (hasValue(currentConfig.xtc_threshold)) {
apiOptions.xtc_threshold = Number(currentConfig.xtc_threshold);
}
if (currentConfig.typ_p !== undefined && currentConfig.typ_p !== null) {
if (hasValue(currentConfig.typ_p)) {
apiOptions.typ_p = Number(currentConfig.typ_p);
}
if (currentConfig.repeat_last_n !== undefined && currentConfig.repeat_last_n !== null) {
if (hasValue(currentConfig.repeat_last_n)) {
apiOptions.repeat_last_n = Number(currentConfig.repeat_last_n);
}
if (currentConfig.repeat_penalty !== undefined && currentConfig.repeat_penalty !== null) {
if (hasValue(currentConfig.repeat_penalty)) {
apiOptions.repeat_penalty = Number(currentConfig.repeat_penalty);
}
if (currentConfig.presence_penalty !== undefined && currentConfig.presence_penalty !== null) {
if (hasValue(currentConfig.presence_penalty)) {
apiOptions.presence_penalty = Number(currentConfig.presence_penalty);
}
if (currentConfig.frequency_penalty !== undefined && currentConfig.frequency_penalty !== null) {
if (hasValue(currentConfig.frequency_penalty)) {
apiOptions.frequency_penalty = Number(currentConfig.frequency_penalty);
}
if (currentConfig.dry_multiplier !== undefined && currentConfig.dry_multiplier !== null) {
if (hasValue(currentConfig.dry_multiplier)) {
apiOptions.dry_multiplier = Number(currentConfig.dry_multiplier);
}
if (currentConfig.dry_base !== undefined && currentConfig.dry_base !== null) {
if (hasValue(currentConfig.dry_base)) {
apiOptions.dry_base = Number(currentConfig.dry_base);
}
if (
currentConfig.dry_allowed_length !== undefined &&
currentConfig.dry_allowed_length !== null
) {
if (hasValue(currentConfig.dry_allowed_length)) {
apiOptions.dry_allowed_length = Number(currentConfig.dry_allowed_length);
}
if (
currentConfig.dry_penalty_last_n !== undefined &&
currentConfig.dry_penalty_last_n !== null
) {
if (hasValue(currentConfig.dry_penalty_last_n)) {
apiOptions.dry_penalty_last_n = Number(currentConfig.dry_penalty_last_n);
}
if (currentConfig.samplers) {