Compare commits

...

11 Commits

Author SHA1 Message Date
dm4 ea3b0590ee embedding : free the batch after execution (#7297) 2024-05-15 15:01:12 +03:00
Georgi Gerganov 29499bb593 sync : ggml 2024-05-15 13:23:41 +03:00
John Balis 48aa8fd1f2 ggml : add ggml_upscale_ext (ggml/814)
* initial commit with CPU implementation of upscale to shape and test, cuda implementation next

* experimental commit to see if dst shape is correct

* test version

* test

* removed unnecessary params

* refactor

* fixed tests

* ggml : metal impl + cleanup + sycl dev warnings

* patched ggml_upscale cuda op to handle non-contiguous tensors, added test for non-contiguous behavior

* metal : fix upsacle op to support nb00 + style

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-05-15 13:23:33 +03:00
Johannes Gäßler 583fd6b000 server bench: fix bench not waiting for model load (#7284) 2024-05-15 08:44:16 +02:00
Georgi Gerganov 9f773486ab script : sync ggml-rpc 2024-05-14 19:14:38 +03:00
Georgi Gerganov e8a7fd4fb0 metal : support FA without mask + add asserts (#7278)
* ggml : fa without mask + add asserts

ggml-ci

* metal : support non-contiguous KV

ggml-ci
2024-05-14 19:09:30 +03:00
Georgi Gerganov a5e3fde857 sync : ggml
ggml-ci
2024-05-14 19:08:09 +03:00
Georgi Gerganov f308ea7059 metal : tune soft_max number of threads (whisper/0) 2024-05-14 19:08:09 +03:00
Georgi Gerganov c3c88f296a ggml : try fix ppc64 (whisper/0) 2024-05-14 19:08:09 +03:00
Przemysław Pawełczyk 182adefcf3 ggml : expose SSE3 and SSSE3 for MSVC when AVX is available (whisper/2128) 2024-05-14 19:08:09 +03:00
Hong Bo PENG 0d26d8ccd8 ggml : optimize for ppc64le using VSX intrinsics (ggml/784)
* optimize for ppc64le using VSX intrinsics

* 1. code clean up by removing comments about overflow concern.

2. fix typo in suffix of scaling.

* Continue to fix typo in suffix of scaling for QK_K <> 256

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-05-14 19:08:09 +03:00
14 changed files with 2425 additions and 147 deletions
+1
View File
@@ -211,6 +211,7 @@ int main(int argc, char ** argv) {
// clean up
llama_print_timings(ctx);
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
+8 -7
View File
@@ -293,13 +293,14 @@ def start_server_background(args):
def is_server_listening(server_fqdn, server_port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
result = sock.connect_ex((server_fqdn, server_port))
_is_server_listening = result == 0
if _is_server_listening:
print(f"server is listening on {server_fqdn}:{server_port}...")
return _is_server_listening
try:
url = f"{server_fqdn}:{server_port}/health"
if not url.startswith("http://"):
url = f"http://{url}"
result = requests.get(url)
return result.status_code == 200
except Exception:
return False
def escape_metric_name(metric_name):
return re.sub('[^A-Z0-9]', '_', metric_name.upper())
+33 -30
View File
@@ -1,35 +1,36 @@
#include "upscale.cuh"
static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
// blockIdx.z: idx of ne02*ne03
// blockIdx.y: idx of ne01*scale_factor aka ne1
// blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
// ne00xne01: ne00 * ne01
int ne0 = ne00 * scale_factor;
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
static __global__ void upscale_f32(const float * x, float * dst,
const int nb00, const int nb01, const int nb02, const int nb03,
const int ne10, const int ne11, const int ne12, const int ne13,
const float sf0, const float sf1, const float sf2, const float sf3) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index >= ne10 * ne11 * ne12 * ne13) {
return;
}
// operation
int i00 = nidx / scale_factor;
int i01 = blockIdx.y / scale_factor;
int offset_src =
i00 +
i01 * ne00 +
blockIdx.z * ne00xne01;
int offset_dst =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
dst[offset_dst] = x[offset_src];
int i10 = index % ne10;
int i11 = (index / ne10) % ne11;
int i12 = (index / (ne10 * ne11)) % ne12;
int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
int i00 = i10 / sf0;
int i01 = i11 / sf1;
int i02 = i12 / sf2;
int i03 = i13 / sf3;
dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
}
static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
const int scale_factor, cudaStream_t stream) {
int ne0 = (ne00 * scale_factor);
int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
static void upscale_f32_cuda(const float * x, float * dst,
const int nb00, const int nb01, const int nb02, const int nb03,
const int ne10, const int ne11, const int ne12, const int ne13,
const float sf0, const float sf1, const float sf2, const float sf3,
cudaStream_t stream) {
int dst_size = ne10 * ne11 * ne12 * ne13;
int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
}
void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -39,10 +40,12 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int scale_factor = dst->op_params[0];
const float sf0 = (float)dst->ne[0]/src0->ne[0];
const float sf1 = (float)dst->ne[1]/src0->ne[1];
const float sf2 = (float)dst->ne[2]/src0->ne[2];
const float sf3 = (float)dst->ne[3]/src0->ne[3];
upscale_f32_cuda(src0_d, dst_d, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, stream);
upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
}
+7
View File
@@ -120,9 +120,16 @@ extern "C" {
#ifndef __F16C__
#define __F16C__
#endif
#endif
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
#ifndef __SSE3__
#define __SSE3__
#endif
#ifndef __SSSE3__
#define __SSSE3__
#endif
#endif
// 16-bit float
+48 -35
View File
@@ -1378,7 +1378,7 @@ static enum ggml_status ggml_metal_graph_compute(
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
if (ne00%4 == 0) {
while (nth < ne00/4 && nth < 256) {
while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
nth *= 2;
}
if (use_f16) {
@@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline;
}
} else {
while (nth < ne00 && nth < 1024) {
while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
nth *= 2;
}
if (use_f16) {
@@ -2353,7 +2353,10 @@ static enum ggml_status ggml_metal_graph_compute(
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);
const int sf = dst->op_params[0];
const float sf0 = (float)ne0/src0->ne[0];
const float sf1 = (float)ne1/src0->ne[1];
const float sf2 = (float)ne2/src0->ne[2];
const float sf3 = (float)ne3/src0->ne[3];
const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
@@ -2376,7 +2379,10 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
[encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18];
[encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19];
[encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20];
[encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21];
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
@@ -2512,13 +2518,14 @@ static enum ggml_status ggml_metal_graph_compute(
} break;
case GGML_OP_FLASH_ATTN_EXT:
{
GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ne11 % 32 == 0);
GGML_ASSERT(src0->type == GGML_TYPE_F32);
struct ggml_tensor * src3 = gf->nodes[i]->src[3];
GGML_ASSERT(ggml_are_same_shape (src1, src2));
GGML_ASSERT(ggml_are_same_shape(src1, src2));
GGML_ASSERT(src3);
struct ggml_tensor * src3 = gf->nodes[i]->src[3];
size_t offs_src3 = 0;
@@ -2528,6 +2535,11 @@ static enum ggml_status ggml_metal_graph_compute(
GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
const uint64_t nb21 = src2 ? src2->nb[1] : 0;
const uint64_t nb22 = src2 ? src2->nb[2] : 0;
const uint64_t nb23 = src2 ? src2->nb[3] : 0;
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
//const int64_t ne31 = src3 ? src3->ne[1] : 0;
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
@@ -2590,34 +2602,35 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
if (id_src3) {
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
} else {
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:3];
}
[encoder setBuffer:id_dst offset:offs_dst atIndex:4];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:6];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:7];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:8];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:10];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:11];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:12];
[encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:14];
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:15];
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:16];
[encoder setBytes:&nb10 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18];
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19];
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20];
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:21];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:22];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:23];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:24];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:25];
[encoder setBytes:&scale length:sizeof( float) atIndex:26];
[encoder setBytes:&max_bias length:sizeof( float) atIndex:27];
[encoder setBytes:&m0 length:sizeof(m0) atIndex:28];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:29];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:30];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11];
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14];
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18];
[encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19];
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22];
[encoder setBytes:&scale length:sizeof( float) atIndex:23];
[encoder setBytes:&max_bias length:sizeof( float) atIndex:24];
[encoder setBytes:&m0 length:sizeof(m0) atIndex:25];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:26];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];
if (!use_vec_kernel) {
// half8x8 kernel
+33 -41
View File
@@ -1852,7 +1852,10 @@ kernel void kernel_upscale_f32(
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant int32_t & sf,
constant float & sf0,
constant float & sf1,
constant float & sf2,
constant float & sf3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
@@ -1861,15 +1864,17 @@ kernel void kernel_upscale_f32(
const int64_t i2 = tgpig.y;
const int64_t i1 = tgpig.x;
const int64_t i03 = i3;
const int64_t i02 = i2;
const int64_t i01 = i1/sf;
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
const int64_t i03 = i3/sf3;
const int64_t i02 = i2/sf2;
const int64_t i01 = i1/sf1;
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
dst_ptr[i0] = src0_ptr[i0/sf];
const int64_t i00 = i0/sf0;
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_ptr[0] = src0_ptr[0];
}
}
@@ -2049,27 +2054,24 @@ typedef void (flash_attn_ext_f16_t)(
device const char * v,
device const char * mask,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant uint64_t & nb21,
constant uint64_t & nb22,
constant uint64_t & nb23,
constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale,
constant float & max_bias,
constant float & m0,
@@ -2090,27 +2092,24 @@ kernel void kernel_flash_attn_ext_f16(
device const char * v,
device const char * mask,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant uint64_t & nb21,
constant uint64_t & nb22,
constant uint64_t & nb23,
constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale,
constant float & max_bias,
constant float & m0,
@@ -2180,10 +2179,6 @@ kernel void kernel_flash_attn_ext_f16(
const short ne22 = ne12;
const short ne23 = ne13;
const uint nb21 = nb11;
const uint nb22 = nb12;
const uint nb23 = nb13;
// broadcast
const short rk2 = ne02/ne12;
const short rk3 = ne03/ne13;
@@ -2247,11 +2242,16 @@ kernel void kernel_flash_attn_ext_f16(
simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
}
// mqk = mqk*scale + mask*slope
simdgroup_half8x8 mm;
simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
simdgroup_multiply(mm, mslope, mm);
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
if (mask != q) {
// mqk = mqk*scale + mask*slope
simdgroup_half8x8 mm;
simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
simdgroup_multiply(mm, mslope, mm);
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
} else {
// mqk = mqk*scale
simdgroup_multiply(mqk, mscale, mqk);
}
simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
}
@@ -2425,27 +2425,24 @@ kernel void kernel_flash_attn_ext_vec_f16(
device const char * v,
device const char * mask,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant uint64_t & nb21,
constant uint64_t & nb22,
constant uint64_t & nb23,
constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale,
constant float & max_bias,
constant float & m0,
@@ -2521,10 +2518,6 @@ kernel void kernel_flash_attn_ext_vec_f16(
const short ne22 = ne12;
const short ne23 = ne13;
const uint nb21 = nb11;
const uint nb22 = nb12;
const uint nb23 = nb13;
// broadcast
const short rk2 = ne02/ne12;
const short rk3 = ne03/ne13;
@@ -2589,8 +2582,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
// mqk = mqk*scale + mask*slope
if (tiisg == 0) {
float4 mm = (float4) mp4[ic/4 + cc];
mqk = mqk*scale + mm*slope;
mqk = mqk*scale + ((mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f);
ss4[cc] = mqk;
}
+2167 -2
View File
File diff suppressed because it is too large Load Diff
+4
View File
@@ -13987,6 +13987,10 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
#pragma message("TODO: generalize upscale operator")
#pragma message(" https://github.com/ggerganov/ggml/pull/814")
GGML_ASSERT(false && "TODO: generalize upscale operator);
const int scale_factor = dst->op_params[0];
upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+59 -17
View File
@@ -1306,6 +1306,8 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
#define GGML_F16_VEC_FMA GGML_F32x4_FMA
#define GGML_F16_VEC_ADD GGML_F32x4_ADD
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
// Use vec_xl, not vec_ld, in case the load address is not aligned.
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
@@ -2822,6 +2824,16 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
(t0->ne[3] == t1->ne[3] );
}
bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
(t0->nb[0] == t1->nb[0] ) &&
(t0->nb[1] == t1->nb[1] ) &&
(t0->nb[2] == t1->nb[2] ) &&
(t0->nb[3] == t1->nb[3] );
}
// check if t1 can be represented as a repeatition of t0
static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@@ -6281,7 +6293,10 @@ struct ggml_tensor * ggml_pool_2d(
static struct ggml_tensor * ggml_upscale_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
int scale_factor) {
int ne0,
int ne1,
int ne2,
int ne3) {
bool is_node = false;
if (a->grad) {
@@ -6289,19 +6304,45 @@ static struct ggml_tensor * ggml_upscale_impl(
is_node = true;
}
GGML_ASSERT(a->ne[0] <= ne0);
GGML_ASSERT(a->ne[1] <= ne1);
GGML_ASSERT(a->ne[2] <= ne2);
GGML_ASSERT(a->ne[3] <= ne3);
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
a->ne[0] * scale_factor,
a->ne[1] * scale_factor,
a->ne[2], a->ne[3]);
ne0,
ne1,
ne2,
ne3
);
result->op = GGML_OP_UPSCALE;
result->op_params[0] = scale_factor;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_upscale(
struct ggml_context * ctx,
struct ggml_tensor * a,
int scale_factor) {
return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
}
struct ggml_tensor * ggml_upscale_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int ne0,
int ne1,
int ne2,
int ne3) {
return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
}
// ggml_pad
struct ggml_tensor * ggml_pad(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -6326,12 +6367,7 @@ struct ggml_tensor * ggml_pad(
return result;
}
struct ggml_tensor * ggml_upscale(
struct ggml_context * ctx,
struct ggml_tensor * a,
int scale_factor) {
return ggml_upscale_impl(ctx, a, scale_factor);
}
// ggml_arange
struct ggml_tensor * ggml_arange(
struct ggml_context * ctx,
@@ -6353,6 +6389,8 @@ struct ggml_tensor * ggml_arange(
return result;
}
// ggml_timestep_embedding
struct ggml_tensor * ggml_timestep_embedding(
struct ggml_context * ctx,
struct ggml_tensor * timesteps,
@@ -14808,25 +14846,28 @@ static void ggml_compute_forward_upscale_f32(
return;
}
GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
const int scale_factor = dst->op_params[0];
const float sf0 = (float)ne0/src0->ne[0];
const float sf1 = (float)ne1/src0->ne[1];
const float sf2 = (float)ne2/src0->ne[2];
const float sf3 = (float)ne3/src0->ne[3];
// TODO: optimize
for (int64_t i3 = 0; i3 < ne3; i3++) {
const int64_t i03 = i3;
const int64_t i03 = i3 / sf3;
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
const int64_t i02 = i2;
const int64_t i02 = i2 / sf2;
for (int64_t i1 = 0; i1 < ne1; i1++) {
const int64_t i01 = i1 / scale_factor;
const int64_t i01 = i1 / sf1;
for (int64_t i0 = 0; i0 < ne0; i0++) {
const int64_t i00 = i0 / scale_factor;
const int64_t i00 = i0 / sf0;
const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
@@ -14856,6 +14897,7 @@ static void ggml_compute_forward_upscale(
}
}
// ggml_compute_forward_pad
static void ggml_compute_forward_pad_f32(
+14 -1
View File
@@ -766,7 +766,8 @@ extern "C" {
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
// use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void);
@@ -1673,12 +1674,24 @@ extern "C" {
float p1);
// nearest interpolate
// multiplies ne0 and ne1 by scale factor
// used in stable-diffusion
GGML_API struct ggml_tensor * ggml_upscale(
struct ggml_context * ctx,
struct ggml_tensor * a,
int scale_factor);
// nearest interpolate
// nearest interpolate to specified dimensions
// used in tortoise.cpp
GGML_API struct ggml_tensor * ggml_upscale_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int ne0,
int ne1,
int ne2,
int ne3);
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
GGML_API struct ggml_tensor * ggml_pad(
struct ggml_context * ctx,
+4
View File
@@ -112,6 +112,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# src/ggml-opencl.h -> ggml-opencl.h
# src/ggml-quants.c -> ggml-quants.c
# src/ggml-quants.h -> ggml-quants.h
# src/ggml-rpc.cpp -> ggml-rpc.cpp
# src/ggml-rpc.h -> ggml-rpc.h
# src/ggml-sycl.cpp -> ggml-sycl.cpp
# src/ggml-sycl.h -> ggml-sycl.h
# src/ggml-vulkan.cpp -> ggml-vulkan.cpp
@@ -149,6 +151,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
-e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
-e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
-e 's/src\/ggml-rpc\.cpp/ggml-rpc.cpp/g' \
-e 's/src\/ggml-rpc\.h/ggml-rpc.h/g' \
-e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
-e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
-e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
+1 -1
View File
@@ -1 +1 @@
30f54cbb3ada3e4c5bc6924de3e5918e5be4ff11
126d34985705a5a2222723c145cb4e125ac689f3
+2
View File
@@ -20,6 +20,8 @@ cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c
cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h
cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml-rpc.cpp
cp -rpv ../ggml/src/ggml-rpc.h ./ggml-rpc.h
cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml-sycl.cpp
cp -rpv ../ggml/src/ggml-sycl.h ./ggml-sycl.h
cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml-vulkan.cpp
+44 -13
View File
@@ -1329,23 +1329,47 @@ struct test_upscale : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
const int32_t scale_factor;
const bool transpose;
std::string vars() override {
return VARS_TO_STR3(type, ne, scale_factor);
return VARS_TO_STR4(type, ne, scale_factor, transpose);
}
test_upscale(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {512, 512, 3, 1},
int32_t scale_factor = 2)
: type(type), ne(ne), scale_factor(scale_factor) {}
int32_t scale_factor = 2, bool transpose = false)
: type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
if (transpose) a = ggml_transpose(ctx, a);
ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
return out;
}
};
// GGML_OP_UPSCALE (ext)
struct test_upscale_ext : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
const std::array<int64_t, 4> ne_tgt;
std::string vars() override {
return VARS_TO_STR3(type, ne, ne_tgt);
}
test_upscale_ext(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {2, 5, 7, 11},
std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
: type(type), ne(ne), ne_tgt(ne_tgt) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
return out;
}
};
// GGML_OP_GROUP_NORM
struct test_group_norm : public test_case {
const ggml_type type;
@@ -1487,25 +1511,27 @@ struct test_flash_attn_ext : public test_case {
const int64_t kv; // kv size
const int64_t nb; // batch size
const bool mask; // use mask
const float max_bias; // ALiBi
std::string vars() override {
return VARS_TO_STR5(hs, nh, kv, nb, max_bias);
return VARS_TO_STR6(hs, nh, kv, nb, mask, max_bias);
}
double max_nmse_err() override {
return 5e-4;
}
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, float max_bias = 0.0f)
: hs(hs), nh(nh), kv(kv), nb(nb), max_bias(max_bias) {}
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f)
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs), max_bias);
ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias);
return out;
}
};
@@ -2167,6 +2193,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_sum_rows());
test_cases.emplace_back(new test_upscale());
test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
test_cases.emplace_back(new test_upscale_ext());
test_cases.emplace_back(new test_group_norm());
test_cases.emplace_back(new test_acc());
test_cases.emplace_back(new test_pad());
@@ -2175,11 +2203,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_leaky_relu());
for (int hs : { 64, 80, 128, 256, }) {
for (float max_bias : {0.0f, 8.0f}) {
for (int nh : { 32, }) {
for (int kv : { 512, 1024, }) {
for (int nb : { 1, 2, 4, 8, }) {
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, max_bias));
for (bool mask : { true, false } ) {
for (float max_bias : { 0.0f, 8.0f }) {
if (!mask && max_bias > 0.0f) continue;
for (int nh : { 32, }) {
for (int kv : { 512, 1024, }) {
for (int nb : { 1, 2, 4, 8, }) {
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias));
}
}
}
}