mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-02 18:47:43 +02:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 83f5872404 | |||
| f0d4d176df | |||
| b17230917c | |||
| bf9087f59a | |||
| 9fb1042ce6 | |||
| 2adf8d83ac | |||
| 021cc28bef | |||
| d498af3d5a | |||
| eacdeb5bfc | |||
| e0cb5c5cb8 |
@@ -843,6 +843,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
|
||||
# ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
|
||||
res = "lfm2"
|
||||
if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
|
||||
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
|
||||
res = "exaone4"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -6780,6 +6783,75 @@ class ExaoneModel(TextModel):
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
|
||||
|
||||
|
||||
@ModelBase.register("Exaone4ForCausalLM")
|
||||
class Exaone4Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.EXAONE4
|
||||
|
||||
def set_vocab(self):
|
||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
|
||||
if hparams.get("sliding_window") is not None:
|
||||
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
|
||||
if "layer_types" in hparams:
|
||||
self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
|
||||
elif "sliding_window_pattern" in hparams:
|
||||
sliding_window_pattern = []
|
||||
if isinstance(hparams["sliding_window_pattern"], str): # e.g. LLLG
|
||||
for i in range(hparams["num_hidden_layers"]):
|
||||
sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L")
|
||||
if isinstance(hparams["sliding_window_pattern"], int): # e.g. 4
|
||||
for i in range(hparams["num_hidden_layers"]):
|
||||
sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0)
|
||||
if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
|
||||
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
||||
base = self.hparams.get("rope_theta", 10_000.0)
|
||||
if (dim := self.hparams.get("head_dim")) is None:
|
||||
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||
|
||||
factor = rope_scaling.get("factor", 16.0)
|
||||
low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
|
||||
high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
|
||||
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
||||
|
||||
low_freq_wavelen = old_context_len / low_freq_factor
|
||||
high_freq_wavelen = old_context_len / high_freq_factor
|
||||
|
||||
rope_factors = []
|
||||
for freq in freqs:
|
||||
wavelen = 2 * math.pi / freq
|
||||
if wavelen < high_freq_wavelen:
|
||||
rope_factors.append(1)
|
||||
elif wavelen > low_freq_wavelen:
|
||||
rope_factors.append(factor)
|
||||
else:
|
||||
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
|
||||
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
|
||||
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
|
||||
|
||||
|
||||
@ModelBase.register("GraniteForCausalLM")
|
||||
class GraniteModel(LlamaModel):
|
||||
"""Conversion for IBM's GraniteForCausalLM"""
|
||||
|
||||
@@ -129,6 +129,7 @@ models = [
|
||||
{"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
|
||||
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
|
||||
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
|
||||
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
|
||||
]
|
||||
|
||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||
|
||||
+18
-22
@@ -305,9 +305,8 @@ On Linux it is possible to use unified memory architecture (UMA) to share main m
|
||||
|
||||
## Vulkan
|
||||
|
||||
**Windows**
|
||||
|
||||
### w64devkit
|
||||
### For Windows Users:
|
||||
**w64devkit**
|
||||
|
||||
Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
|
||||
|
||||
@@ -334,7 +333,7 @@ cmake -B build -DGGML_VULKAN=ON
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
### Git Bash MINGW64
|
||||
**Git Bash MINGW64**
|
||||
|
||||
Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
|
||||
|
||||
@@ -357,7 +356,8 @@ Now you can load the model in conversation mode using `Vulkan`
|
||||
build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
|
||||
```
|
||||
|
||||
### MSYS2
|
||||
**MSYS2**
|
||||
|
||||
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
|
||||
```sh
|
||||
pacman -S git \
|
||||
@@ -373,9 +373,9 @@ cmake -B build -DGGML_VULKAN=ON
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
**With docker**:
|
||||
### For Docker users:
|
||||
|
||||
You don't need to install Vulkan SDK. It will be installed inside the container.
|
||||
You don't need to install the Vulkan SDK. It will be installed inside the container.
|
||||
|
||||
```sh
|
||||
# Build the image
|
||||
@@ -385,32 +385,28 @@ docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile .
|
||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
|
||||
**Without docker**:
|
||||
### For Linux users:
|
||||
|
||||
Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
|
||||
First, follow the the official [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide.
|
||||
|
||||
For example, on Ubuntu 22.04 (jammy), use the command below:
|
||||
> [!IMPORTANT]
|
||||
> After completing the first step, ensure that you have used the `source` command on the `setup_env.sh` file inside of the Vulkan SDK in your current terminal session. Otherwise, the build won't work. Additionally, if you close out of your terminal, you must perform this step again if you intend to perform a build. However, there are ways to make this persistent. Refer to the Vulkan SDK guide linked in the first step for more information about any of this.
|
||||
|
||||
Second, after verifying that you have done everything in the Vulkan SDK guide provided in the first step, run the following command to verify that everything is set up correctly:
|
||||
```bash
|
||||
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
|
||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
|
||||
apt update -y
|
||||
apt-get install -y vulkan-sdk
|
||||
# To verify the installation, use the command below:
|
||||
vulkaninfo
|
||||
```
|
||||
|
||||
Alternatively your package manager might be able to provide the appropriate libraries.
|
||||
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
|
||||
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
|
||||
|
||||
Then, build llama.cpp using the cmake command below:
|
||||
|
||||
Then, assuming you have `cd` into your llama.cpp folder and there are no errors with running `vulkaninfo`, you can proceed to build llama.cpp using the CMake commands below:
|
||||
```bash
|
||||
cmake -B build -DGGML_VULKAN=1
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
Finally, after finishing your build, you should be able to do this:
|
||||
```bash
|
||||
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
||||
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||
./build/bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||
|
||||
# You should see in the output, ggml_vulkan detected your GPU. For example:
|
||||
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
|
||||
|
||||
@@ -184,6 +184,9 @@ int main(int argc, char ** argv) {
|
||||
// extra text to insert in each client's prompt in order to make it larger
|
||||
const int32_t n_junk = std::max(1, params.n_junk);
|
||||
|
||||
// signed seed, use negative values to indicate different seeds for the different clients
|
||||
const int32_t & sseed = params.sampling.seed;
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@@ -219,12 +222,21 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
if (sseed >= 0) {
|
||||
LOG_INF("%s: initializing all samplers with the same RNG seed: %d (use a negative seed to have different seeds)\n", __func__, sseed);
|
||||
} else {
|
||||
LOG_INF("%s: initializing samplers with different RNG seeds, starting from %d\n", __func__, sseed);
|
||||
}
|
||||
|
||||
std::vector<client> clients(n_clients);
|
||||
for (size_t i = 0; i < clients.size(); ++i) {
|
||||
auto & client = clients[i];
|
||||
client.id = i;
|
||||
client.smpl = common_sampler_init(model, params.sampling);
|
||||
//params.sampling.seed++;
|
||||
|
||||
if (sseed < 0) {
|
||||
params.sampling.seed--;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokens_system;
|
||||
|
||||
@@ -22,21 +22,6 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
|
||||
return t->view_src != NULL;
|
||||
}
|
||||
|
||||
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||
if (a->type != b->type) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (a->ne[i] != b->ne[i]) {
|
||||
return false;
|
||||
}
|
||||
if (a->nb[i] != b->nb[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// ops that return true for this function must not use restrict pointers for their backend implementations
|
||||
static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
switch (op) {
|
||||
|
||||
@@ -352,21 +352,6 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
|
||||
|
||||
// backend copy
|
||||
|
||||
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||
if (a->type != b->type) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (a->ne[i] != b->ne[i]) {
|
||||
return false;
|
||||
}
|
||||
if (a->nb[i] != b->nb[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
||||
|
||||
|
||||
@@ -2590,6 +2590,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
||||
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
|
||||
|
||||
const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
|
||||
const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
@@ -2611,9 +2614,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||
#endif
|
||||
}
|
||||
|
||||
if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
|
||||
// disable CUDA graphs for batch size > 1 for now.
|
||||
// Changes in batch size or context size can cause changes to the grid size of some kernels.
|
||||
if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) {
|
||||
// disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
|
||||
// by means of matching node names. See
|
||||
// https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
|
||||
// https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
|
||||
// Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
|
||||
use_cuda_graph = false;
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
|
||||
|
||||
@@ -73,6 +73,22 @@ static inline int ggml_up(int n, int m) {
|
||||
return (n + m - 1) & ~(m - 1);
|
||||
}
|
||||
|
||||
// TODO: move to ggml.h?
|
||||
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||
if (a->type != b->type) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (a->ne[i] != b->ne[i]) {
|
||||
return false;
|
||||
}
|
||||
if (a->nb[i] != b->nb[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// logging
|
||||
//
|
||||
|
||||
@@ -126,6 +126,7 @@ typedef struct {
|
||||
uint64_t nb2;
|
||||
uint64_t nb3;
|
||||
uint64_t offs;
|
||||
uint64_t o1[8];
|
||||
} ggml_metal_kargs_bin;
|
||||
|
||||
typedef struct {
|
||||
@@ -240,7 +241,7 @@ typedef struct {
|
||||
float max_bias;
|
||||
float m0;
|
||||
float m1;
|
||||
uint16_t n_head_log2;
|
||||
int32_t n_head_log2;
|
||||
float logit_softcap;
|
||||
} ggml_metal_kargs_flash_attn_ext;
|
||||
|
||||
@@ -377,8 +378,16 @@ typedef struct {
|
||||
typedef struct {
|
||||
int32_t ne00;
|
||||
int32_t ne00_4;
|
||||
uint64_t nb01;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
uint64_t nb3;
|
||||
float eps;
|
||||
int32_t nef1[3];
|
||||
int32_t nef2[3];
|
||||
int32_t nef3[3];
|
||||
uint64_t nbf1[3];
|
||||
uint64_t nbf2[3];
|
||||
uint64_t nbf3[3];
|
||||
} ggml_metal_kargs_rms_norm;
|
||||
|
||||
typedef struct {
|
||||
@@ -484,7 +493,7 @@ typedef struct {
|
||||
float max_bias;
|
||||
float m0;
|
||||
float m1;
|
||||
uint32_t n_head_log2;
|
||||
int32_t n_head_log2;
|
||||
} ggml_metal_kargs_soft_max;
|
||||
|
||||
typedef struct {
|
||||
|
||||
@@ -55,6 +55,12 @@ static struct ggml_backend_metal_device_context {
|
||||
bool has_residency_sets;
|
||||
bool has_bfloat;
|
||||
bool use_bfloat;
|
||||
bool use_fusion;
|
||||
|
||||
int debug_fusion;
|
||||
|
||||
// how many times a given op was fused
|
||||
uint64_t fuse_cnt[GGML_OP_COUNT];
|
||||
|
||||
size_t max_size;
|
||||
|
||||
@@ -69,6 +75,9 @@ static struct ggml_backend_metal_device_context {
|
||||
/*.has_residency_sets =*/ false,
|
||||
/*.has_bfloat =*/ false,
|
||||
/*.use_bfloat =*/ false,
|
||||
/*.use_fusion =*/ true,
|
||||
/*.debug_fusion =*/ 0,
|
||||
/*.fuse_cnt =*/ { 0 },
|
||||
/*.max_size =*/ 0,
|
||||
/*.name =*/ "",
|
||||
};
|
||||
@@ -83,16 +92,14 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
|
||||
|
||||
if (ctx->mtl_device == nil) {
|
||||
ctx->mtl_device = MTLCreateSystemDefaultDevice();
|
||||
}
|
||||
|
||||
if (ctx->mtl_device) {
|
||||
ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
||||
ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
|
||||
|
||||
ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
||||
|
||||
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
|
||||
ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
|
||||
ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == nil;
|
||||
#endif
|
||||
|
||||
ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
|
||||
@@ -103,6 +110,14 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
|
||||
#else
|
||||
ctx->use_bfloat = false;
|
||||
#endif
|
||||
ctx->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
|
||||
|
||||
{
|
||||
const char * val = getenv("GGML_METAL_FUSION_DEBUG");
|
||||
ctx->debug_fusion = val ? atoi(val) : 0;
|
||||
}
|
||||
|
||||
memset(ctx->fuse_cnt, 0, sizeof(ctx->fuse_cnt));
|
||||
|
||||
ctx->max_size = ctx->mtl_device.maxBufferLength;
|
||||
|
||||
@@ -122,6 +137,18 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
||||
ctx->mtl_device_ref_count--;
|
||||
|
||||
if (ctx->mtl_device_ref_count == 0) {
|
||||
if (ctx->debug_fusion > 0) {
|
||||
fprintf(stderr, "%s: fusion stats:\n", __func__);
|
||||
for (int i = 0; i < GGML_OP_COUNT; i++) {
|
||||
if (ctx->fuse_cnt[i] == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// note: cannot use ggml_log here
|
||||
fprintf(stderr, "%s: - %s: %" PRIu64 "\n", __func__, ggml_op_name((enum ggml_op) i), ctx->fuse_cnt[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (ctx->mtl_lock) {
|
||||
[ctx->mtl_lock release];
|
||||
ctx->mtl_lock = nil;
|
||||
@@ -147,13 +174,27 @@ struct ggml_metal_kernel {
|
||||
|
||||
enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_ADD,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_ROW,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_FUSE_2,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_FUSE_3,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_FUSE_4,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_FUSE_5,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_FUSE_6,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_FUSE_7,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_FUSE_8,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_ROW_C4,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7,
|
||||
GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8,
|
||||
GGML_METAL_KERNEL_TYPE_SUB,
|
||||
GGML_METAL_KERNEL_TYPE_SUB_ROW,
|
||||
GGML_METAL_KERNEL_TYPE_SUB_ROW_C4,
|
||||
GGML_METAL_KERNEL_TYPE_MUL,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_ROW,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_ROW_C4,
|
||||
GGML_METAL_KERNEL_TYPE_DIV,
|
||||
GGML_METAL_KERNEL_TYPE_DIV_ROW,
|
||||
GGML_METAL_KERNEL_TYPE_DIV_ROW_C4,
|
||||
GGML_METAL_KERNEL_TYPE_REPEAT_F32,
|
||||
GGML_METAL_KERNEL_TYPE_REPEAT_F16,
|
||||
GGML_METAL_KERNEL_TYPE_REPEAT_I32,
|
||||
@@ -218,6 +259,8 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1,
|
||||
GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL,
|
||||
GGML_METAL_KERNEL_TYPE_RMS_NORM,
|
||||
GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL,
|
||||
GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD,
|
||||
GGML_METAL_KERNEL_TYPE_L2_NORM,
|
||||
GGML_METAL_KERNEL_TYPE_GROUP_NORM,
|
||||
GGML_METAL_KERNEL_TYPE_NORM,
|
||||
@@ -1135,13 +1178,27 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
// simd_sum and simd_max requires MTLGPUFamilyApple7
|
||||
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_2, add_fuse_2, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_3, add_fuse_3, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_4, add_fuse_4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_5, add_fuse_5, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_6, add_fuse_6, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_7, add_fuse_7, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_8, add_fuse_8, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4, add_row_c4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2, add_row_c4_fuse_2, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3, add_row_c4_fuse_3, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4, add_row_c4_fuse_4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5, add_row_c4_fuse_5, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6, add_row_c4_fuse_6, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7, add_row_c4_fuse_7, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8, add_row_c4_fuse_8, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB, sub, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW, sub_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW_C4, sub_row_c4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW_C4, mul_row_c4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW_C4, div_row_c4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F32, repeat_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F16, repeat_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I32, repeat_i32, true);
|
||||
@@ -1206,6 +1263,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1, set_rows_q5_1, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL, set_rows_iq4_nl, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL, rms_norm_mul, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD, rms_norm_mul_add, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_L2_NORM, l2_norm, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true);
|
||||
@@ -1893,7 +1952,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_metal_encode_node(
|
||||
static int ggml_metal_encode_node(
|
||||
ggml_backend_t backend,
|
||||
int idx,
|
||||
id<MTLComputeCommandEncoder> encoder,
|
||||
@@ -1903,7 +1962,10 @@ static bool ggml_metal_encode_node(
|
||||
|
||||
struct ggml_cgraph * gf = ctx->gf;
|
||||
|
||||
struct ggml_tensor * node = ggml_graph_node(gf, idx);
|
||||
enum ggml_op ops[8];
|
||||
|
||||
struct ggml_tensor ** nodes = ggml_graph_nodes(gf) + idx;
|
||||
struct ggml_tensor * node = nodes[0];
|
||||
|
||||
//GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
|
||||
|
||||
@@ -1913,7 +1975,7 @@ static bool ggml_metal_encode_node(
|
||||
struct ggml_tensor * dst = node;
|
||||
|
||||
if (ggml_is_empty(dst)) {
|
||||
return true;
|
||||
return 1;
|
||||
}
|
||||
|
||||
switch (dst->op) {
|
||||
@@ -1924,7 +1986,7 @@ static bool ggml_metal_encode_node(
|
||||
case GGML_OP_PERMUTE:
|
||||
{
|
||||
// noop -> next node
|
||||
} return true;
|
||||
} return 1;
|
||||
default:
|
||||
{
|
||||
} break;
|
||||
@@ -1991,6 +2053,8 @@ static bool ggml_metal_encode_node(
|
||||
id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
|
||||
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil;
|
||||
|
||||
int n_fuse = 1;
|
||||
|
||||
#if 0
|
||||
GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
|
||||
if (src0) {
|
||||
@@ -2062,37 +2126,15 @@ static bool ggml_metal_encode_node(
|
||||
GGML_ASSERT(src0t == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_rows(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous_rows(src1));
|
||||
|
||||
const size_t offs = 0;
|
||||
|
||||
bool bcast_row = false;
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
// src1 is a row
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
|
||||
switch (dst->op) {
|
||||
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
|
||||
case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
|
||||
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
|
||||
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
bcast_row = true;
|
||||
} else {
|
||||
switch (dst->op) {
|
||||
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
|
||||
case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
|
||||
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
|
||||
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
ggml_metal_kargs_bin args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
@@ -2119,12 +2161,117 @@ static bool ggml_metal_encode_node(
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
/*.offs =*/ offs,
|
||||
/*.o1 =*/ { offs_src1 },
|
||||
};
|
||||
|
||||
// c[0] = add(a, b[0])
|
||||
// c[1] = add(c[0], b[1])
|
||||
// c[2] = add(c[1], b[2])
|
||||
// ...
|
||||
if (ctx_dev->use_fusion) {
|
||||
ops[0] = GGML_OP_ADD;
|
||||
ops[1] = GGML_OP_ADD;
|
||||
ops[2] = GGML_OP_ADD;
|
||||
ops[3] = GGML_OP_ADD;
|
||||
ops[4] = GGML_OP_ADD;
|
||||
ops[5] = GGML_OP_ADD;
|
||||
ops[6] = GGML_OP_ADD;
|
||||
ops[7] = GGML_OP_ADD;
|
||||
|
||||
size_t offs_fuse;
|
||||
id<MTLBuffer> id_fuse;
|
||||
|
||||
for (n_fuse = 0; n_fuse <= 6; ++n_fuse) {
|
||||
if (!ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (nodes[n_fuse] != nodes[n_fuse + 1]->src[0]) {
|
||||
break;
|
||||
}
|
||||
|
||||
// b[0] === b[1] === ...
|
||||
if (!ggml_are_same_layout(nodes[n_fuse]->src[1], nodes[n_fuse + 1]->src[1])) {
|
||||
break;
|
||||
}
|
||||
|
||||
// only fuse nodes if src1 is in the same Metal buffer
|
||||
id_fuse = ggml_metal_get_buffer(nodes[n_fuse + 1]->src[1], &offs_fuse);
|
||||
if (id_fuse != id_src1) {
|
||||
break;
|
||||
}
|
||||
|
||||
ctx_dev->fuse_cnt[nodes[n_fuse + 1]->op]++;
|
||||
|
||||
args.o1[n_fuse + 1] = offs_fuse;
|
||||
}
|
||||
|
||||
++n_fuse;
|
||||
|
||||
if (ctx_dev->debug_fusion > 1 && n_fuse > 1) {
|
||||
GGML_LOG_DEBUG("%s: fuse: ADD x %d\n", __func__, n_fuse);
|
||||
}
|
||||
}
|
||||
|
||||
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
// src1 is a row
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
|
||||
switch (dst->op) {
|
||||
case GGML_OP_ADD:
|
||||
{
|
||||
switch (n_fuse) {
|
||||
case 1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4 ].pipeline; break;
|
||||
case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2].pipeline; break;
|
||||
case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3].pipeline; break;
|
||||
case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4].pipeline; break;
|
||||
case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5].pipeline; break;
|
||||
case 6: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6].pipeline; break;
|
||||
case 7: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7].pipeline; break;
|
||||
case 8: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW_C4].pipeline; break;
|
||||
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW_C4].pipeline; break;
|
||||
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW_C4].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
bcast_row = true;
|
||||
} else {
|
||||
switch (dst->op) {
|
||||
case GGML_OP_ADD:
|
||||
{
|
||||
switch (n_fuse) {
|
||||
case 1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD ].pipeline; break;
|
||||
case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_2].pipeline; break;
|
||||
case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_3].pipeline; break;
|
||||
case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_4].pipeline; break;
|
||||
case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_5].pipeline; break;
|
||||
case 6: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_6].pipeline; break;
|
||||
case 7: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_7].pipeline; break;
|
||||
case 8: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_8].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
|
||||
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
|
||||
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
if (n_fuse > 1) {
|
||||
id_dst = ggml_metal_get_buffer(nodes[n_fuse - 1], &offs_dst);
|
||||
}
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:0];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
|
||||
[encoder setBuffer:id_src1 offset:0 atIndex:2];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
|
||||
|
||||
if (bcast_row) {
|
||||
@@ -2132,7 +2279,11 @@ static bool ggml_metal_encode_node(
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} else {
|
||||
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
|
||||
int nth = 32;
|
||||
|
||||
while (16*nth < ne0 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
|
||||
nth *= 2;
|
||||
}
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
}
|
||||
@@ -2257,12 +2408,13 @@ static bool ggml_metal_encode_node(
|
||||
/*.nb2 =*/ pnb2,
|
||||
/*.nb3 =*/ pnb3,
|
||||
/*.offs =*/ offs,
|
||||
/*.o1 =*/ { offs_src1},
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:0];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
|
||||
[encoder setBuffer:id_src1 offset:0 atIndex:2];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
|
||||
|
||||
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
|
||||
@@ -2764,7 +2916,7 @@ static bool ggml_metal_encode_node(
|
||||
id<MTLBuffer> h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0));
|
||||
if (!h_src0) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0));
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
offs_src0 = 0;
|
||||
@@ -3640,7 +3792,7 @@ static bool ggml_metal_encode_node(
|
||||
id<MTLBuffer> h_src1 = ggml_metal_mem_pool_alloc(mem_pool, s_src1);
|
||||
if (!h_src1) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_src1);
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
const int64_t neh0 = ne0;
|
||||
@@ -3656,7 +3808,7 @@ static bool ggml_metal_encode_node(
|
||||
id<MTLBuffer> h_dst = ggml_metal_mem_pool_alloc(mem_pool, s_dst);
|
||||
if (!h_dst) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_dst);
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// tokens per expert
|
||||
@@ -3664,7 +3816,7 @@ static bool ggml_metal_encode_node(
|
||||
id<MTLBuffer> h_tpe = ggml_metal_mem_pool_alloc(mem_pool, s_tpe);
|
||||
if (!h_tpe) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tpe);
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// id map
|
||||
@@ -3673,7 +3825,7 @@ static bool ggml_metal_encode_node(
|
||||
id<MTLBuffer> h_ids = ggml_metal_mem_pool_alloc(mem_pool, s_ids);
|
||||
if (!h_ids) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_ids);
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
{
|
||||
@@ -4105,12 +4257,95 @@ static bool ggml_metal_encode_node(
|
||||
case GGML_OP_RMS_NORM:
|
||||
{
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous_rows(src0));
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
|
||||
ggml_metal_kargs_rms_norm args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne00_4 =*/ ne00/4,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
/*.eps =*/ eps,
|
||||
/*.nef1 =*/ { ne01 },
|
||||
/*.nef2 =*/ { ne02 },
|
||||
/*.nef3 =*/ { ne03 },
|
||||
/*.nbf1 =*/ { nb01 },
|
||||
/*.nbf2 =*/ { nb02 },
|
||||
/*.nbf3 =*/ { nb03 },
|
||||
};
|
||||
|
||||
size_t offs_fuse[2] = { 0, 0 };
|
||||
id<MTLBuffer> id_fuse[2] = { id_src0, id_src0 };
|
||||
|
||||
// d[0] = rms_norm(a)
|
||||
// d[1] = mul(d[0], b)
|
||||
// d[2] = add(d[1], c)
|
||||
if (ctx_dev->use_fusion) {
|
||||
ops[0] = GGML_OP_RMS_NORM;
|
||||
ops[1] = GGML_OP_MUL;
|
||||
ops[2] = GGML_OP_ADD;
|
||||
|
||||
for (n_fuse = 0; n_fuse <= 1; ++n_fuse) {
|
||||
if (!ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (nodes[n_fuse] != nodes[n_fuse + 1]->src[0]) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (nodes[n_fuse + 1]->src[1]->ne[0] != node->ne[0]) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (!ggml_is_contiguous_rows(nodes[n_fuse + 1]->src[1])) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (nodes[n_fuse + 1]->type != GGML_TYPE_F32) {
|
||||
break;
|
||||
}
|
||||
|
||||
ctx_dev->fuse_cnt[nodes[n_fuse + 1]->op]++;
|
||||
|
||||
id_fuse[n_fuse] = ggml_metal_get_buffer(nodes[n_fuse + 1]->src[1], &offs_fuse[n_fuse]);
|
||||
|
||||
args.nef1[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->ne[1];
|
||||
args.nef2[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->ne[2];
|
||||
args.nef3[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->ne[3];
|
||||
|
||||
args.nbf1[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->nb[1];
|
||||
args.nbf2[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->nb[2];
|
||||
args.nbf3[n_fuse + 1] = nodes[n_fuse + 1]->src[1]->nb[3];
|
||||
}
|
||||
|
||||
++n_fuse;
|
||||
|
||||
if (ctx_dev->debug_fusion > 1 && n_fuse > 1) {
|
||||
if (n_fuse == 2) {
|
||||
GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL\n", __func__);
|
||||
}
|
||||
if (n_fuse == 3) {
|
||||
GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL + ADD\n", __func__);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (n_fuse > 1) {
|
||||
id_dst = ggml_metal_get_buffer(nodes[n_fuse - 1], &offs_dst);
|
||||
}
|
||||
|
||||
id<MTLComputePipelineState> pipeline;
|
||||
|
||||
switch (n_fuse) {
|
||||
case 1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM ].pipeline; break;
|
||||
case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL ].pipeline; break;
|
||||
case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD].pipeline; break;
|
||||
default: GGML_ABORT("unsupported n_fuse = %d\n", n_fuse);
|
||||
}
|
||||
|
||||
int nth = 32; // SIMD width
|
||||
|
||||
@@ -4121,23 +4356,16 @@ static bool ggml_metal_encode_node(
|
||||
nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
|
||||
nth = MIN(nth, ne00/4);
|
||||
|
||||
ggml_metal_kargs_rms_norm args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne00_4 =*/ ne00/4,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.eps =*/ eps,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:0];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:0];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
|
||||
[encoder setBuffer:id_fuse[0] offset:offs_fuse[0] atIndex:2];
|
||||
[encoder setBuffer:id_fuse[1] offset:offs_fuse[1] atIndex:3];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:4];
|
||||
|
||||
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
|
||||
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_L2_NORM:
|
||||
{
|
||||
@@ -5532,7 +5760,7 @@ static bool ggml_metal_encode_node(
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
return n_fuse;
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_metal_graph_compute(
|
||||
@@ -6038,20 +6266,22 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
||||
struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool;
|
||||
ggml_metal_mem_pool_reset(mem_pool);
|
||||
|
||||
for (int idx = node_start; idx < node_end; ++idx) {
|
||||
for (int idx = node_start; idx < node_end;) {
|
||||
if (should_capture) {
|
||||
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
|
||||
}
|
||||
|
||||
const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool);
|
||||
const int res = ggml_metal_encode_node(backend, idx, encoder, mem_pool);
|
||||
|
||||
if (should_capture) {
|
||||
[encoder popDebugGroup];
|
||||
}
|
||||
|
||||
if (!res) {
|
||||
if (res == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
idx += res;
|
||||
}
|
||||
|
||||
[encoder endEncoding];
|
||||
|
||||
@@ -832,7 +832,8 @@ enum ggml_sort_order {
|
||||
// general-purpose kernel for addition, subtraction, multiplication and division of two tensors
|
||||
// pros: works for non-contiguous tensors, supports broadcast across all dims
|
||||
// cons: not very efficient
|
||||
kernel void kernel_add(
|
||||
template <int F>
|
||||
kernel void kernel_add_fuse_impl(
|
||||
constant ggml_metal_kargs_bin & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
@@ -848,16 +849,39 @@ kernel void kernel_add(
|
||||
const int i12 = i02%args.ne12;
|
||||
const int i11 = i01%args.ne11;
|
||||
|
||||
device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
|
||||
device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
|
||||
device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs;
|
||||
device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs);
|
||||
device float * dst_ptr = (device float *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs);
|
||||
|
||||
device const float * src1_ptr[F];
|
||||
for (short j = 0; j < F; ++j) {
|
||||
src1_ptr[j] = (device const float *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11);
|
||||
}
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
const int i10 = i0%args.ne10;
|
||||
*((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) + *((device float *)(src1_ptr + i10*args.nb10));
|
||||
|
||||
float res = src0_ptr[i0];
|
||||
|
||||
#pragma unroll
|
||||
for (short j = 0; j < F; ++j) {
|
||||
res += src1_ptr[j][i10];
|
||||
}
|
||||
|
||||
dst_ptr[i0] = res;
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_add_fuse_impl<2>) kernel_add_fuse_t;
|
||||
|
||||
template [[host_name("kernel_add")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<1>;
|
||||
template [[host_name("kernel_add_fuse_2")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<2>;
|
||||
template [[host_name("kernel_add_fuse_3")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<3>;
|
||||
template [[host_name("kernel_add_fuse_4")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<4>;
|
||||
template [[host_name("kernel_add_fuse_5")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<5>;
|
||||
template [[host_name("kernel_add_fuse_6")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<6>;
|
||||
template [[host_name("kernel_add_fuse_7")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<7>;
|
||||
template [[host_name("kernel_add_fuse_8")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<8>;
|
||||
|
||||
kernel void kernel_sub(
|
||||
constant ggml_metal_kargs_bin & args,
|
||||
device const char * src0,
|
||||
@@ -875,7 +899,7 @@ kernel void kernel_sub(
|
||||
const int i11 = i01%args.ne11;
|
||||
|
||||
device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
|
||||
device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
|
||||
device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0];
|
||||
device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
@@ -900,9 +924,9 @@ kernel void kernel_mul(
|
||||
const int i12 = i02%args.ne12;
|
||||
const int i11 = i01%args.ne11;
|
||||
|
||||
device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01;
|
||||
device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
|
||||
device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1;
|
||||
device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
|
||||
device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0];
|
||||
device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
const int i10 = i0%args.ne10;
|
||||
@@ -926,9 +950,9 @@ kernel void kernel_div(
|
||||
const int i12 = i02%args.ne12;
|
||||
const int i11 = i01%args.ne11;
|
||||
|
||||
device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01;
|
||||
device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
|
||||
device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1;
|
||||
device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
|
||||
device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0];
|
||||
device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
const int i10 = i0%args.ne10;
|
||||
@@ -970,46 +994,145 @@ template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat
|
||||
|
||||
// assumption: src1 is a row
|
||||
// broadcast src1 into src0
|
||||
kernel void kernel_add_row(
|
||||
template <short F>
|
||||
kernel void kernel_add_row_c4_fuse_impl(
|
||||
constant ggml_metal_kargs_bin & args,
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
|
||||
const uint nb = args.ne00/4;
|
||||
dst[tpig] = src0[tpig] + src1[tpig % nb];
|
||||
const uint i = tpig % nb;
|
||||
|
||||
device const float4 * src0_row = (device const float4 *) (src0);
|
||||
device float4 * dst_row = (device float4 *) (dst);
|
||||
|
||||
device const float4 * src1_row[F];
|
||||
for (short j = 0; j < F; ++j) {
|
||||
src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
|
||||
}
|
||||
|
||||
float4 res = src0_row[tpig];
|
||||
|
||||
#pragma unroll(F)
|
||||
for (short j = 0; j < F; ++j) {
|
||||
res += src1_row[j][i];
|
||||
}
|
||||
|
||||
dst_row[tpig] = res;
|
||||
}
|
||||
|
||||
kernel void kernel_sub_row(
|
||||
typedef decltype(kernel_add_row_c4_fuse_impl<1>) kernel_add_row_c4_fuse_t;
|
||||
|
||||
template [[host_name("kernel_add_row_c4")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<1>;
|
||||
template [[host_name("kernel_add_row_c4_fuse_2")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<2>;
|
||||
template [[host_name("kernel_add_row_c4_fuse_3")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<3>;
|
||||
template [[host_name("kernel_add_row_c4_fuse_4")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<4>;
|
||||
template [[host_name("kernel_add_row_c4_fuse_5")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<5>;
|
||||
template [[host_name("kernel_add_row_c4_fuse_6")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<6>;
|
||||
template [[host_name("kernel_add_row_c4_fuse_7")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<7>;
|
||||
template [[host_name("kernel_add_row_c4_fuse_8")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<8>;
|
||||
|
||||
template <short F>
|
||||
kernel void kernel_sub_row_c4_fuse_impl(
|
||||
constant ggml_metal_kargs_bin & args,
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
|
||||
const uint nb = args.ne00/4;
|
||||
dst[tpig] = src0[tpig] - src1[tpig % nb];
|
||||
const uint i = tpig % nb;
|
||||
|
||||
device const float4 * src0_row = (device const float4 *) (src0);
|
||||
device float4 * dst_row = (device float4 *) (dst);
|
||||
|
||||
device const float4 * src1_row[F];
|
||||
for (short j = 0; j < F; ++j) {
|
||||
src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
|
||||
}
|
||||
|
||||
float4 res = src0_row[tpig];
|
||||
|
||||
#pragma unroll(F)
|
||||
for (short j = 0; j < F; ++j) {
|
||||
res -= src1_row[j][i];
|
||||
}
|
||||
|
||||
dst_row[tpig] = res;
|
||||
}
|
||||
|
||||
kernel void kernel_mul_row(
|
||||
typedef decltype(kernel_sub_row_c4_fuse_impl<1>) kernel_sub_row_c4_fuse_t;
|
||||
|
||||
template [[host_name("kernel_sub_row_c4")]] kernel kernel_sub_row_c4_fuse_t kernel_sub_row_c4_fuse_impl<1>;
|
||||
|
||||
template <short F>
|
||||
kernel void kernel_mul_row_c4_fuse_impl(
|
||||
constant ggml_metal_kargs_bin & args,
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
|
||||
const uint nb = args.ne00/4;
|
||||
dst[tpig] = src0[tpig] * src1[tpig % nb];
|
||||
const uint i = tpig % nb;
|
||||
|
||||
device const float4 * src0_row = (device const float4 *) (src0);
|
||||
device float4 * dst_row = (device float4 *) (dst);
|
||||
|
||||
device const float4 * src1_row[F];
|
||||
for (short j = 0; j < F; ++j) {
|
||||
src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
|
||||
}
|
||||
|
||||
float4 res = src0_row[tpig];
|
||||
|
||||
#pragma unroll(F)
|
||||
for (short j = 0; j < F; ++j) {
|
||||
res *= src1_row[j][i];
|
||||
}
|
||||
|
||||
dst_row[tpig] = res;
|
||||
}
|
||||
|
||||
kernel void kernel_div_row(
|
||||
typedef decltype(kernel_mul_row_c4_fuse_impl<1>) kernel_mul_row_c4_fuse_t;
|
||||
|
||||
template [[host_name("kernel_mul_row_c4")]] kernel kernel_mul_row_c4_fuse_t kernel_mul_row_c4_fuse_impl<1>;
|
||||
|
||||
template <short F>
|
||||
kernel void kernel_div_row_c4_fuse_impl(
|
||||
constant ggml_metal_kargs_bin & args,
|
||||
device const float4 * src0,
|
||||
device const float4 * src1,
|
||||
device float4 * dst,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
|
||||
const uint nb = args.ne00/4;
|
||||
dst[tpig] = src0[tpig] / src1[tpig % nb];
|
||||
const uint i = tpig % nb;
|
||||
|
||||
device const float4 * src0_row = (device const float4 *) (src0);
|
||||
device float4 * dst_row = (device float4 *) (dst);
|
||||
|
||||
device const float4 * src1_row[F];
|
||||
for (short j = 0; j < F; ++j) {
|
||||
src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
|
||||
}
|
||||
|
||||
float4 res = src0_row[tpig];
|
||||
|
||||
#pragma unroll(F)
|
||||
for (short j = 0; j < F; ++j) {
|
||||
res /= src1_row[j][i];
|
||||
}
|
||||
|
||||
dst_row[tpig] = res;
|
||||
}
|
||||
|
||||
typedef decltype(kernel_div_row_c4_fuse_impl<1>) kernel_div_row_c4_fuse_t;
|
||||
|
||||
template [[host_name("kernel_div_row_c4")]] kernel kernel_div_row_c4_fuse_t kernel_div_row_c4_fuse_impl<1>;
|
||||
|
||||
kernel void kernel_scale(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
@@ -2116,26 +2239,39 @@ kernel void kernel_norm(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_rms_norm(
|
||||
// F == 1 : rms_norm (no fuse)
|
||||
// F == 2 : rms_norm + mul
|
||||
// F == 3 : rms_norm + mul + add
|
||||
template <short F>
|
||||
kernel void kernel_rms_norm_fuse_impl(
|
||||
constant ggml_metal_kargs_rms_norm & args,
|
||||
device const char * src0,
|
||||
device const char * src1_0,
|
||||
device const char * src1_1,
|
||||
device char * dst,
|
||||
threadgroup float * shmem_f32 [[threadgroup(0)]],
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort ntg[[threads_per_threadgroup]]) {
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
if (sgitg == 0) {
|
||||
shmem_f32[tiisg] = 0.0f;
|
||||
}
|
||||
|
||||
device const float4 * x = (device const float4 *) (src0 + tgpig*args.nb01);
|
||||
const int i01 = tgpig.x;
|
||||
const int i02 = tgpig.y;
|
||||
const int i03 = tgpig.z;
|
||||
|
||||
device const float4 * x = (device const float4 *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);
|
||||
|
||||
device const float4 * f0 = (device const float4 *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
|
||||
device const float4 * f1 = (device const float4 *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
// parallel sum
|
||||
for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_4; i00 += ntg.x) {
|
||||
sumf += dot(x[i00], x[i00]);
|
||||
}
|
||||
sumf = simd_sum(sumf);
|
||||
@@ -2154,12 +2290,26 @@ kernel void kernel_rms_norm(
|
||||
const float mean = sumf/args.ne00;
|
||||
const float scale = 1.0f/sqrt(mean + args.eps);
|
||||
|
||||
device float4 * y = (device float4 *) dst + tgpig*args.ne00_4;
|
||||
for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
|
||||
y[i00] = x[i00] * scale;
|
||||
device float4 * y = (device float4 *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_4; i00 += ntg.x) {
|
||||
if (F == 1) {
|
||||
y[i00] = (x[i00]*scale);
|
||||
}
|
||||
if (F == 2) {
|
||||
y[i00] = (x[i00]*scale)*f0[i00];
|
||||
}
|
||||
if (F == 3) {
|
||||
y[i00] = (x[i00]*scale)*f0[i00] + f1[i00];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_rms_norm_fuse_impl<1>) kernel_rms_norm_fuse_t;
|
||||
|
||||
template [[host_name("kernel_rms_norm")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<1>;
|
||||
template [[host_name("kernel_rms_norm_mul")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<2>;
|
||||
template [[host_name("kernel_rms_norm_mul_add")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<3>;
|
||||
|
||||
kernel void kernel_l2_norm(
|
||||
constant ggml_metal_kargs_l2_norm & args,
|
||||
device const char * src0,
|
||||
|
||||
@@ -765,8 +765,8 @@ void write_output_files() {
|
||||
len += "};\n";
|
||||
}
|
||||
}
|
||||
fprintf(src, data.c_str());
|
||||
fprintf(src, len.c_str());
|
||||
fputs(data.c_str(), src);
|
||||
fputs(len.c_str(), src);
|
||||
}
|
||||
fclose(hdr);
|
||||
fclose(src);
|
||||
|
||||
@@ -354,6 +354,7 @@ class MODEL_ARCH(IntEnum):
|
||||
JAIS = auto()
|
||||
NEMOTRON = auto()
|
||||
EXAONE = auto()
|
||||
EXAONE4 = auto()
|
||||
GRANITE = auto()
|
||||
GRANITE_MOE = auto()
|
||||
GRANITE_HYBRID = auto()
|
||||
@@ -671,6 +672,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.JAIS: "jais",
|
||||
MODEL_ARCH.NEMOTRON: "nemotron",
|
||||
MODEL_ARCH.EXAONE: "exaone",
|
||||
MODEL_ARCH.EXAONE4: "exaone4",
|
||||
MODEL_ARCH.GRANITE: "granite",
|
||||
MODEL_ARCH.GRANITE_MOE: "granitemoe",
|
||||
MODEL_ARCH.GRANITE_HYBRID: "granitehybrid",
|
||||
@@ -2197,6 +2199,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.EXAONE4: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_POST_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_POST_NORM,
|
||||
],
|
||||
MODEL_ARCH.GRANITE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
||||
@@ -1 +1 @@
|
||||
d62df60a07ba3deeb85e5cfc9b1ee07645ff35e2
|
||||
3323219cd3cc050e5c7133cd4fc1e50d1f590faf
|
||||
|
||||
@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_JAIS, "jais" },
|
||||
{ LLM_ARCH_NEMOTRON, "nemotron" },
|
||||
{ LLM_ARCH_EXAONE, "exaone" },
|
||||
{ LLM_ARCH_EXAONE4, "exaone4" },
|
||||
{ LLM_ARCH_RWKV6, "rwkv6" },
|
||||
{ LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
|
||||
{ LLM_ARCH_RWKV7, "rwkv7" },
|
||||
@@ -1510,6 +1511,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_EXAONE4,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
||||
}
|
||||
},
|
||||
{
|
||||
LLM_ARCH_RWKV6,
|
||||
{
|
||||
|
||||
@@ -72,6 +72,7 @@ enum llm_arch {
|
||||
LLM_ARCH_JAIS,
|
||||
LLM_ARCH_NEMOTRON,
|
||||
LLM_ARCH_EXAONE,
|
||||
LLM_ARCH_EXAONE4,
|
||||
LLM_ARCH_RWKV6,
|
||||
LLM_ARCH_RWKV6QWEN2,
|
||||
LLM_ARCH_RWKV7,
|
||||
|
||||
@@ -56,6 +56,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
|
||||
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
|
||||
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
|
||||
{ "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 },
|
||||
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
|
||||
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
|
||||
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
|
||||
@@ -168,6 +169,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
} else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
|
||||
return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
|
||||
} else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
|
||||
if (tmpl_contains("[|tool|]")) {
|
||||
return LLM_CHAT_TEMPLATE_EXAONE_4;
|
||||
}
|
||||
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
||||
// EXAONE-3.0-7.8B-Instruct
|
||||
return LLM_CHAT_TEMPLATE_EXAONE_3;
|
||||
@@ -532,6 +536,22 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "[|assistant|]";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_4) {
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
if (role == "system") {
|
||||
ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
|
||||
} else if (role == "user") {
|
||||
ss << "[|user|]" << trim(message->content) << "\n";
|
||||
} else if (role == "assistant") {
|
||||
ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
|
||||
} else if (role == "tool") {
|
||||
ss << "[|tool|]" << trim(message->content) << "[|endofturn|]\n";
|
||||
}
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "[|assistant|]";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
|
||||
// this template requires the model to have "\n\n" as EOT token
|
||||
for (size_t i = 0; i < chat.size(); i++) {
|
||||
|
||||
@@ -35,6 +35,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_GLMEDGE,
|
||||
LLM_CHAT_TEMPLATE_MINICPM,
|
||||
LLM_CHAT_TEMPLATE_EXAONE_3,
|
||||
LLM_CHAT_TEMPLATE_EXAONE_4,
|
||||
LLM_CHAT_TEMPLATE_RWKV_WORLD,
|
||||
LLM_CHAT_TEMPLATE_GRANITE,
|
||||
LLM_CHAT_TEMPLATE_GIGACHAT,
|
||||
|
||||
@@ -1312,7 +1312,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
//
|
||||
|
||||
uint32_t llama_context::graph_max_nodes() const {
|
||||
return std::max<uint32_t>(65536u, 5u*model.n_tensors());
|
||||
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
||||
}
|
||||
|
||||
llm_graph_result * llama_context::get_gf_res_reserve() const {
|
||||
|
||||
+21
-11
@@ -428,6 +428,8 @@ void llm_graph_result::reset() {
|
||||
t_embd = nullptr;
|
||||
t_embd_pooled = nullptr;
|
||||
|
||||
params = {};
|
||||
|
||||
inputs.clear();
|
||||
|
||||
buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
|
||||
@@ -905,20 +907,28 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
cb(cur, "ffn_moe_weighted", il);
|
||||
}
|
||||
|
||||
// aggregate experts
|
||||
ggml_tensor * moe_out = nullptr;
|
||||
for (int i = 0; i < n_expert_used; ++i) {
|
||||
ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
|
||||
experts->nb[2], i*experts->nb[1]);
|
||||
ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
|
||||
|
||||
if (i == 0) {
|
||||
moe_out = cur_expert;
|
||||
} else {
|
||||
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
||||
}
|
||||
assert(n_expert_used > 0);
|
||||
|
||||
// order the views before the adds
|
||||
for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
|
||||
cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
|
||||
|
||||
ggml_build_forward_expand(gf, cur_experts[i]);
|
||||
}
|
||||
|
||||
if (n_expert_used == 1) {
|
||||
// aggregate experts
|
||||
// note: here we explicitly use hparams.n_expert_used instead of n_expert_used
|
||||
// to avoid potentially a large number of add nodes during warmup
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14753
|
||||
ggml_tensor * moe_out = cur_experts[0];
|
||||
|
||||
for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
|
||||
moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
|
||||
}
|
||||
|
||||
if (hparams.n_expert_used == 1) {
|
||||
// avoid returning a non-contiguous tensor
|
||||
moe_out = ggml_cont(ctx0, moe_out);
|
||||
}
|
||||
|
||||
@@ -1490,6 +1490,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_EXAONE4:
|
||||
{
|
||||
if (hparams.n_layer == 64) { // 32B
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.n_swa = 4096;
|
||||
hparams.set_swa_pattern(4);
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 30: type = LLM_TYPE_1_2B; break;
|
||||
case 64: type = LLM_TYPE_32B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_RWKV6:
|
||||
case LLM_ARCH_RWKV6QWEN2:
|
||||
{
|
||||
@@ -4355,6 +4372,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_EXAONE4:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
// if output is NULL, init from the input tok embed
|
||||
if (output == NULL) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
|
||||
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
||||
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_RWKV6:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
@@ -13478,6 +13528,142 @@ struct llm_build_exaone : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
template <bool iswa>
|
||||
struct llm_build_exaone4 : public llm_graph_context {
|
||||
llm_build_exaone4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
|
||||
inp_attn_type * inp_attn = nullptr;
|
||||
|
||||
if constexpr (iswa) {
|
||||
inp_attn = build_attn_inp_kv_unified_iswa();
|
||||
} else {
|
||||
inp_attn = build_attn_inp_kv_unified();
|
||||
}
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// use RoPE for SWA layers or non-SWA models
|
||||
const bool use_rope = hparams.is_swa(il) || hparams.swa_type == LLAMA_SWA_TYPE_NONE;
|
||||
|
||||
cur = inpL;
|
||||
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
|
||||
if (use_rope) {
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
}
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.layers[il].attn_post_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_post_norm", il);
|
||||
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = build_ffn(ffn_inp,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.layers[il].ffn_post_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
cb(cur, "ffn_post_norm", -1);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
|
||||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
};
|
||||
|
||||
struct llm_build_rwkv6_base : public llm_graph_context {
|
||||
const llama_model & model;
|
||||
|
||||
@@ -17163,6 +17349,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
{
|
||||
llm = std::make_unique<llm_build_exaone>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_EXAONE4:
|
||||
{
|
||||
if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
|
||||
llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
|
||||
} else {
|
||||
llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_RWKV6:
|
||||
{
|
||||
llm = std::make_unique<llm_build_rwkv6>(*this, params);
|
||||
@@ -17430,6 +17624,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_ORION:
|
||||
case LLM_ARCH_NEMOTRON:
|
||||
case LLM_ARCH_EXAONE:
|
||||
case LLM_ARCH_EXAONE4:
|
||||
case LLM_ARCH_MINICPM3:
|
||||
case LLM_ARCH_DOTS1:
|
||||
case LLM_ARCH_HUNYUAN_MOE:
|
||||
|
||||
@@ -1925,6 +1925,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
} else if (
|
||||
tokenizer_pre == "exaone") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
|
||||
} else if (
|
||||
tokenizer_pre == "exaone4") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
||||
} else if (
|
||||
tokenizer_pre == "chameleon") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
|
||||
|
||||
+46
-20
@@ -2353,9 +2353,12 @@ struct test_bin_bcast : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const std::array<int, 4> nr;
|
||||
int nf; // number of fused ops, nf == 1 -> single op (no fusion)
|
||||
|
||||
bool run_whole_graph() override { return true; }
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(type, ne, nr);
|
||||
return VARS_TO_STR4(type, ne, nr, nf);
|
||||
}
|
||||
|
||||
size_t op_size(ggml_tensor * t) override {
|
||||
@@ -2364,24 +2367,35 @@ struct test_bin_bcast : public test_case {
|
||||
|
||||
test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 10, 1, 1},
|
||||
std::array<int, 4> nr = {1, 2, 1, 1})
|
||||
: op(op), type(type), ne(ne), nr(nr) {}
|
||||
std::array<int, 4> nr = {1, 2, 1, 1},
|
||||
int nf = 1)
|
||||
: op(op), type(type), ne(ne), nr(nr), nf(nf) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
GGML_ASSERT(nf <= 8);
|
||||
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_set_name(b, "b");
|
||||
|
||||
// The backward pass supports broadcasting only for GGML_ADD:
|
||||
const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b);
|
||||
if (grad_supported) {
|
||||
ggml_set_param(a);
|
||||
ggml_set_param(b);
|
||||
ggml_tensor * b[8];
|
||||
for (int i = 0; i < nf; ++i) {
|
||||
b[i] = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_set_name(b[i], (std::string("b") + std::to_string(i)).c_str());
|
||||
}
|
||||
|
||||
// The backward pass supports broadcasting only for GGML_ADD:
|
||||
const bool grad_supported = op == ggml_add && ggml_are_same_shape(a, b[0]) && nf == 1;
|
||||
if (grad_supported) {
|
||||
ggml_set_param(a);
|
||||
ggml_set_param(b[0]);
|
||||
}
|
||||
|
||||
ggml_tensor * out = a;
|
||||
|
||||
for (int i = 0; i < nf; ++i) {
|
||||
out = op(ctx, out, b[i]);
|
||||
}
|
||||
|
||||
ggml_tensor * out = op(ctx, a, b);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
@@ -2622,15 +2636,15 @@ struct test_rms_norm_back : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_RMS_NORM + GGML_OP_MUL
|
||||
struct test_rms_norm_mul : public test_case {
|
||||
// GGML_OP_RMS_NORM + GGML_OP_MUL + GGML_OP_ADD
|
||||
struct test_rms_norm_mul_add : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const float eps;
|
||||
|
||||
std::string op_desc(ggml_tensor * t) override {
|
||||
GGML_UNUSED(t);
|
||||
return "RMS_NORM_MUL";
|
||||
return "RMS_NORM_MUL_ADD";
|
||||
}
|
||||
|
||||
bool run_whole_graph() override { return true; }
|
||||
@@ -2639,7 +2653,7 @@ struct test_rms_norm_mul : public test_case {
|
||||
return VARS_TO_STR3(type, ne, eps);
|
||||
}
|
||||
|
||||
test_rms_norm_mul(ggml_type type = GGML_TYPE_F32,
|
||||
test_rms_norm_mul_add(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {64, 5, 4, 3},
|
||||
float eps = 1e-6f)
|
||||
: type(type), ne(ne), eps(eps) {}
|
||||
@@ -2647,14 +2661,17 @@ struct test_rms_norm_mul : public test_case {
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * c = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_set_param(a);
|
||||
ggml_set_name(a, "a");
|
||||
ggml_set_param(b);
|
||||
ggml_set_name(b, "b");
|
||||
ggml_set_param(c);
|
||||
ggml_set_name(c, "c");
|
||||
|
||||
// Use a and b early, so we don't end up with an OP_NONE between rms_norm and mul
|
||||
a = ggml_add(ctx, a, b);
|
||||
ggml_tensor * out = ggml_mul(ctx, ggml_rms_norm(ctx, a, eps), b);
|
||||
// Use a, b and c early, so we don't end up with an OP_NONE between rms_norm and mul
|
||||
a = ggml_add(ctx, ggml_add(ctx, a, b), c);
|
||||
ggml_tensor * out = ggml_add(ctx, ggml_mul(ctx, ggml_rms_norm(ctx, a, eps), b), c);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
@@ -5151,6 +5168,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
//add_test_bin_bcast(type, {3, 3, 2560, 1280}, {2, 1, 1, 1});
|
||||
}
|
||||
|
||||
// fusion
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1}, 2));
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 2, 1, 1}, 3));
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1}, 4));
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 2}, 5));
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 2}, 6));
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 2, 2}, 7));
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {2, 2, 2, 2}, 8));
|
||||
|
||||
test_cases.emplace_back(new test_add1());
|
||||
test_cases.emplace_back(new test_scale());
|
||||
test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f));
|
||||
@@ -5165,7 +5191,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_l2_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
||||
}
|
||||
for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f}) {
|
||||
test_cases.emplace_back(new test_rms_norm_mul(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
||||
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f));
|
||||
|
||||
Reference in New Issue
Block a user