mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 10:07:44 +02:00
Compare commits
19 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9b26511857 | |||
| 00217cd413 | |||
| 3b337b01a1 | |||
| a86a580a66 | |||
| 0f7c69689f | |||
| 835b2b915c | |||
| b05a9d650f | |||
| 27052978e4 | |||
| 077c94d0ca | |||
| aa3ee0eb0b | |||
| d0991da39d | |||
| aa719c2f88 | |||
| 4cdd0bb453 | |||
| b5bd037832 | |||
| dfcd53f7ec | |||
| 4ea00794b8 | |||
| 02a6a82ae7 | |||
| c498fc82fe | |||
| e7a5130a20 |
@@ -1,10 +1,10 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG MUSA_VERSION=rc4.2.0
|
||||
ARG MUSA_VERSION=rc4.3.0
|
||||
# Target the MUSA build image
|
||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
|
||||
ARG BASE_MUSA_DEV_CONTAINER=sh-harbor.mthreads.com/haive/mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
|
||||
|
||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
|
||||
ARG BASE_MUSA_RUN_CONTAINER=sh-harbor.mthreads.com/haive/mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
|
||||
|
||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
||||
|
||||
|
||||
@@ -475,7 +475,7 @@ jobs:
|
||||
|
||||
ubuntu-22-cmake-musa:
|
||||
runs-on: ubuntu-22.04
|
||||
container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
|
||||
container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
||||
@@ -68,22 +68,19 @@ jobs:
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Determine tag name
|
||||
- name: Determine source tag name
|
||||
id: srctag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Determine image tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
|
||||
REPO_NAME="${{ github.event.repository.name }}"
|
||||
|
||||
# determine tag name postfix (build number, commit hash)
|
||||
if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
|
||||
TAG_POSTFIX="-b${BUILD_NUMBER}"
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
|
||||
TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
|
||||
fi
|
||||
# list all tags possible
|
||||
if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
|
||||
TYPE=""
|
||||
@@ -91,9 +88,9 @@ jobs:
|
||||
TYPE="-${{ matrix.config.tag }}"
|
||||
fi
|
||||
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
|
||||
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
|
||||
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
|
||||
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
|
||||
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
|
||||
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
|
||||
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
|
||||
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
|
||||
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
|
||||
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
|
||||
@@ -101,7 +98,6 @@ jobs:
|
||||
echo "light_output_tags=$LIGHTTAGS" # print out for debugging
|
||||
echo "server_output_tags=$SERVERTAGS" # print out for debugging
|
||||
env:
|
||||
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||
|
||||
- name: Free Disk Space (Ubuntu)
|
||||
@@ -177,3 +173,29 @@ jobs:
|
||||
# return to this if the experimental github cache is having issues
|
||||
#cache-to: type=local,dest=/tmp/.buildx-cache
|
||||
#cache-from: type=local,src=/tmp/.buildx-cache
|
||||
|
||||
create_tag:
|
||||
name: Create and push git tag
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Determine source tag name
|
||||
id: srctag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Create and push git tag
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
git tag ${{ steps.srctag.outputs.name }} || exit 0
|
||||
git push origin ${{ steps.srctag.outputs.name }} || exit 0
|
||||
|
||||
+2
-1
@@ -63,7 +63,7 @@
|
||||
/ggml/src/ggml-quants.* @ggerganov
|
||||
/ggml/src/ggml-threading.* @ggerganov @slaren
|
||||
/ggml/src/ggml-vulkan/ @0cc4m
|
||||
/ggml/src/ggml-zdnn/ @taronaeo @AlekseiNikiforovIBM
|
||||
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
|
||||
/ggml/src/ggml.c @ggerganov @slaren
|
||||
/ggml/src/ggml.cpp @ggerganov @slaren
|
||||
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
|
||||
@@ -103,4 +103,5 @@
|
||||
/LICENSE @ggerganov
|
||||
/README.md @ggerganov
|
||||
/SECURITY.md @ggerganov
|
||||
/build-xcframework.sh @danbev
|
||||
requirements*.txt @CISC
|
||||
|
||||
+1
-1
@@ -25,7 +25,7 @@ The project differentiates between 3 levels of contributors:
|
||||
- Squash-merge PRs
|
||||
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
|
||||
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
|
||||
- Let other maintainers, merge their own PRs
|
||||
- Let other maintainers merge their own PRs
|
||||
- When merging a PR, make sure you have a good understanding of the changes
|
||||
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
|
||||
|
||||
|
||||
@@ -178,6 +178,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
|
||||
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
|
||||
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
|
||||
- Java: [QuasarByte/llama-cpp-jna](https://github.com/QuasarByte/llama-cpp-jna)
|
||||
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
|
||||
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
|
||||
- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
|
||||
|
||||
+1
-1
@@ -21,7 +21,7 @@ docker run --privileged -it \
|
||||
-v $HOME/llama.cpp/ci-cache:/ci-cache \
|
||||
-v $HOME/llama.cpp/ci-results:/ci-results \
|
||||
-v $PWD:/ws -w /ws \
|
||||
mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
|
||||
mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
|
||||
```
|
||||
|
||||
Inside the container, execute the following commands:
|
||||
|
||||
+3
-5
@@ -961,15 +961,13 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
|
||||
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
||||
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
||||
bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
|
||||
|
||||
if (!has_eos && !has_sep) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
|
||||
if (!has_eos && !has_sep && !has_rerank_prompt) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
} else if (!has_eos) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
||||
} else if (!has_sep) {
|
||||
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
|
||||
+1
-1
@@ -738,7 +738,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||
// MoE utils
|
||||
//
|
||||
|
||||
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
|
||||
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
|
||||
|
||||
static std::string llm_ffn_exps_block_regex(int idx) {
|
||||
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
|
||||
|
||||
@@ -3717,11 +3717,29 @@ class Qwen2MoeModel(TextModel):
|
||||
class Qwen3Model(Qwen2Model):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN3
|
||||
|
||||
# extra logic for rerank models
|
||||
is_rerank: bool = False
|
||||
is_tied_embeddings: bool = False
|
||||
token_false_id: int | None = None
|
||||
token_true_id: int | None = None
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# track for intern-s1-mini
|
||||
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
|
||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
|
||||
# a bit hacky, but currently the only way to detect if this is a rerank model
|
||||
# ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
|
||||
readme_path = self.dir_model / "README.md"
|
||||
readme_text = ""
|
||||
if readme_path.exists():
|
||||
with readme_path.open("r", encoding="utf-8") as f:
|
||||
readme_text = f.read()
|
||||
if "# Qwen3-Reranker" in readme_text:
|
||||
self._find_rerank_config()
|
||||
|
||||
def set_vocab(self):
|
||||
# deal with intern-s1-mini
|
||||
if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
|
||||
@@ -3730,6 +3748,53 @@ class Qwen3Model(Qwen2Model):
|
||||
|
||||
super().set_vocab()
|
||||
|
||||
def _find_rerank_config(self):
    """Mark this model as a reranker and record the token ids it needs.

    Loads the tokenizer from ``self.dir_model`` and stores the ids of the
    "no", "yes" and "|" tokens, plus whether the config ties word embeddings.
    """
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(self.dir_model)

    self.is_rerank = True
    self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)

    # Lookups are kept in the original order: "no", "yes", then the separator.
    self.token_false_id = tok.convert_tokens_to_ids("no")
    self.token_true_id = tok.convert_tokens_to_ids("yes")
    self.sep_token_id = tok.convert_tokens_to_ids("|")

    # Both answer tokens must resolve; the classifier head is built from them.
    assert self.token_false_id is not None
    assert self.token_true_id is not None
|
||||
|
||||
def set_gguf_parameters(self):
    """Write the base parameters, plus rerank metadata when ``is_rerank`` is set.

    For rerank models this adds RANK pooling, the "yes"/"no" classifier labels
    and a chat template named "rerank" containing ``{query}`` and
    ``{document}`` placeholders.
    """
    super().set_gguf_parameters()
    if not self.is_rerank:
        return

    rerank_template = (
        "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
        "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
        "<|im_start|>assistant\n<think>\n\n</think>\n\n"
    )

    writer = self.gguf_writer
    writer.add_pooling_type(gguf.PoolingType.RANK)
    writer.add_classifier_output_labels(["yes", "no"])
    writer.add_chat_template([{"name": "rerank", "template": rerank_template}])
|
||||
|
||||
def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
|
||||
# extract "yes" and "no" tokens from the output lm_head tensor
|
||||
false_row = data_torch[self.token_false_id]
|
||||
true_row = data_torch[self.token_true_id]
|
||||
return torch.stack([true_row, false_row], dim=0)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Emit the classifier head for rerank models; defer to the parent otherwise.

    With tied embeddings the head is derived from the ``embed_tokens`` tensor,
    which is also emitted under its mapped name. Without tied embeddings the
    head is derived from ``lm_head`` and only the classifier tensor is emitted.
    """
    if not self.is_rerank:
        return super().modify_tensors(data_torch, name, bid)

    # Which source tensor carries the output head depends on weight tying.
    head_marker = "embed_tokens" if self.is_tied_embeddings else "lm_head"
    if head_marker not in name:
        return super().modify_tensors(data_torch, name, bid)

    cls_out_head = (
        gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight",
        self._get_cls_out_tensor(data_torch),
    )
    if self.is_tied_embeddings:
        return [cls_out_head, (self.map_tensor_name(name), data_torch)]
    return [cls_out_head]
|
||||
|
||||
|
||||
@ModelBase.register("Qwen3MoeForCausalLM")
|
||||
class Qwen3MoeModel(Qwen2MoeModel):
|
||||
@@ -7930,6 +7995,121 @@ class BailingMoeModel(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
class GroveMoeModel(TextModel):
    """Conversion class registered for the GroveMoE architecture.

    Per-expert projection weights (``experts.N`` and ``chunk_experts.N``) are
    staged per block as they arrive and emitted as one stacked tensor per
    projection once every expert of that group in the block has been seen.
    """
    model_arch = gguf.MODEL_ARCH.GROVEMOE

    # Lazily created per-block staging dicts (original tensor name -> tensor).
    _experts: list[dict[str, Tensor]] | None = None
    _chunk_experts: list[dict[str, Tensor]] | None = None

    def set_gguf_parameters(self):
        """Write the MoE hyperparameters and optional YaRN rope scaling on top of the base parameters."""
        super().set_gguf_parameters()
        if (n_experts := self.hparams.get("num_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299
        self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128)
        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298
        self.gguf_writer.add_experts_per_group(2)
        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
        self.gguf_writer.add_expert_group_scale(0.05)
        # YaRN is not enabled by default
        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
        rope_scaling = self.hparams.get("rope_scaling") or {}
        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

    def _stage_and_merge_experts(
        self, store_attr: str, group: str, n_experts: int, data_torch: Tensor, name: str, bid: int,
    ) -> list[tuple[str, Tensor]]:
        """Stage one per-expert tensor; once the block is complete, return the merged tensors.

        ``store_attr`` names the staging attribute (``"_experts"`` or
        ``"_chunk_experts"``) and ``group`` is the matching path component in
        the source tensor names. Returns an empty list while the block still
        has fewer than ``n_experts * 3`` staged tensors.
        """
        staged = getattr(self, store_attr)
        if staged is None:
            staged = [{} for _ in range(self.block_count)]
            setattr(self, store_attr, staged)

        block = staged[bid]
        block[name] = data_torch

        # Three projections (down/gate/up) are expected per expert.
        if len(block) < n_experts * 3:
            return []

        tensors: list[tuple[str, Tensor]] = []

        # merge the experts into a single 3d tensor
        for w_name in ["down_proj", "gate_proj", "up_proj"]:
            datas: list[Tensor] = []

            for xid in range(n_experts):
                ename = f"model.layers.{bid}.mlp.{group}.{xid}.{w_name}.weight"
                datas.append(block[ename])
                # Consumed entries are removed so prepare_tensors can detect leftovers.
                del block[ename]

            stacked = torch.stack(datas, dim=0)
            merged_name = f"model.layers.{bid}.mlp.{group}.{w_name}.weight"
            tensors.append((self.map_tensor_name(merged_name), stacked))

        return tensors

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Drop ``.expert_bias`` tensors, merge expert weights, and map every other tensor name."""
        if name.endswith(".expert_bias"):
            # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303
            return []

        # process the experts separately
        # "chunk_experts" must be tested first because it also contains "experts".
        if "chunk_experts" in name:
            n_experts = self.hparams["num_experts"] // 2  # see add_experts_per_group
            assert bid is not None
            return self._stage_and_merge_experts("_chunk_experts", "chunk_experts", n_experts, data_torch, name, bid)

        if "experts" in name:
            n_experts = self.hparams["num_experts"]
            assert bid is not None
            return self._stage_and_merge_experts("_experts", "experts", n_experts, data_torch, name, bid)

        return [(self.map_tensor_name(name), data_torch)]

    def prepare_tensors(self):
        """Run the base preparation, then fail if any staged expert tensor was never merged."""
        super().prepare_tensors()

        if self._chunk_experts is not None:
            # flatten `list[dict[str, Tensor]]` into `list[str]`
            chunk_experts = [k for d in self._chunk_experts for k in d.keys()]
            if len(chunk_experts) > 0:
                raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}")

        if self._experts is not None:
            # flatten `list[dict[str, Tensor]]` into `list[str]`
            experts = [k for d in self._experts for k in d.keys()]
            if len(experts) > 0:
                raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("ChameleonForConditionalGeneration")
|
||||
@ModelBase.register("ChameleonForCausalLM") # obsolete
|
||||
class ChameleonModel(TextModel):
|
||||
|
||||
+1
-1
@@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
|
||||
|
||||
The defaults are:
|
||||
|
||||
- `MUSA_VERSION` set to `rc4.2.0`
|
||||
- `MUSA_VERSION` set to `rc4.3.0`
|
||||
|
||||
The resulting images are essentially the same as the non-MUSA images:
|
||||
|
||||
|
||||
@@ -95,8 +95,13 @@ int main(int argc, char ** argv) {
|
||||
params.n_batch = params.n_ctx;
|
||||
}
|
||||
|
||||
// For non-causal models, batch size must be equal to ubatch size
|
||||
params.n_ubatch = params.n_batch;
|
||||
// for non-causal models, batch size must be equal to ubatch size
|
||||
if (params.attention_type != LLAMA_ATTENTION_TYPE_CAUSAL) {
|
||||
params.n_ubatch = params.n_batch;
|
||||
}
|
||||
|
||||
// get max number of sequences per batch
|
||||
const int n_seq_max = llama_max_parallel_sequences();
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@@ -144,6 +149,7 @@ int main(int argc, char ** argv) {
|
||||
// get added sep and eos token, if any
|
||||
const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
|
||||
const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
|
||||
const char * rerank_prompt = llama_model_chat_template(model, "rerank");
|
||||
|
||||
// tokenize the prompts and trim
|
||||
std::vector<std::vector<int32_t>> inputs;
|
||||
@@ -153,21 +159,28 @@ int main(int argc, char ** argv) {
|
||||
// split classification pairs and insert expected separator tokens
|
||||
if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
|
||||
std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
|
||||
std::string final_prompt;
|
||||
|
||||
for (size_t i = 0; i < pairs.size(); i++) {
|
||||
final_prompt += pairs[i];
|
||||
if (i != pairs.size() - 1) {
|
||||
if (!added_eos_token.empty()) {
|
||||
final_prompt += added_eos_token;
|
||||
}
|
||||
if (!added_sep_token.empty()) {
|
||||
final_prompt += added_sep_token;
|
||||
if (rerank_prompt != nullptr) {
|
||||
const std::string query = pairs[0];
|
||||
const std::string doc = pairs[1];
|
||||
std::string final_prompt = rerank_prompt;
|
||||
string_replace_all(final_prompt, "{query}" , query);
|
||||
string_replace_all(final_prompt, "{document}", doc );
|
||||
inp = common_tokenize(vocab, final_prompt, true, true);
|
||||
} else {
|
||||
std::string final_prompt;
|
||||
for (size_t i = 0; i < pairs.size(); i++) {
|
||||
final_prompt += pairs[i];
|
||||
if (i != pairs.size() - 1) {
|
||||
if (!added_eos_token.empty()) {
|
||||
final_prompt += added_eos_token;
|
||||
}
|
||||
if (!added_sep_token.empty()) {
|
||||
final_prompt += added_sep_token;
|
||||
}
|
||||
}
|
||||
}
|
||||
inp = common_tokenize(ctx, final_prompt, true, true);
|
||||
}
|
||||
|
||||
inp = common_tokenize(ctx, final_prompt, true, true);
|
||||
} else {
|
||||
inp = common_tokenize(ctx, prompt, true, true);
|
||||
}
|
||||
@@ -229,7 +242,7 @@ int main(int argc, char ** argv) {
|
||||
const uint64_t n_toks = inp.size();
|
||||
|
||||
// encode if at capacity
|
||||
if (batch.n_tokens + n_toks > n_batch) {
|
||||
if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
|
||||
float * out = emb + e * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
|
||||
|
||||
@@ -118,13 +118,17 @@ embedding-convert-model:
|
||||
|
||||
embedding-run-original-model:
|
||||
$(call validate_embedding_model_path,embedding-run-original-model)
|
||||
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/embedding/run-original-model.py
|
||||
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
|
||||
./scripts/embedding/run-original-model.py \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
|
||||
|
||||
embedding-run-converted-model:
|
||||
@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/embedding/run-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}
|
||||
@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
|
||||
|
||||
embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
|
||||
@./scripts/embedding/compare-embeddings-logits.sh
|
||||
@./scripts/embedding/compare-embeddings-logits.sh \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
|
||||
|
||||
embedding-inspect-original-model:
|
||||
$(call validate_embedding_model_path,embedding-inspect-original-model)
|
||||
@@ -156,7 +160,8 @@ embedding-quantize-model:
|
||||
$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)
|
||||
|
||||
embedding-run-quantized-model:
|
||||
@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
|
||||
@./scripts/embedding/run-converted-model.sh $(QUANTIZED_EMBEDDING_MODEL) \
|
||||
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
|
||||
|
||||
###
|
||||
### Perplexity targets/recipes
|
||||
|
||||
@@ -151,6 +151,35 @@ int main(int argc, char ** argv) {
|
||||
logits = llama_get_embeddings(ctx);
|
||||
n_logits = llama_model_n_embd(model) * batch.n_tokens;
|
||||
type = "-embeddings";
|
||||
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
const int n_embd_count = batch.n_tokens;
|
||||
|
||||
printf("Embedding dimension: %d\n", n_embd);
|
||||
printf("\n");
|
||||
|
||||
// Print embeddings in the specified format
|
||||
for (int j = 0; j < n_embd_count; j++) {
|
||||
printf("embedding %d: ", j);
|
||||
|
||||
// Print first 3 values
|
||||
for (int i = 0; i < 3 && i < n_embd; i++) {
|
||||
printf("%9.6f ", logits[j * n_embd + i]);
|
||||
}
|
||||
|
||||
printf(" ... ");
|
||||
|
||||
// Print last 3 values
|
||||
for (int i = n_embd - 3; i < n_embd; i++) {
|
||||
if (i >= 0) {
|
||||
printf("%9.6f ", logits[j * n_embd + i]);
|
||||
}
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("Embeddings size: %d\n", n_logits);
|
||||
} else {
|
||||
logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
|
||||
@@ -183,22 +212,23 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
for (int i = 0; i < n_logits; i++) {
|
||||
fprintf(f, "%d: %.6f\n", i, logits[i]); // Added index and changed format
|
||||
fprintf(f, "%d: %.6f\n", i, logits[i]);
|
||||
}
|
||||
fclose(f);
|
||||
|
||||
// Print first and last 10 logits for quick verification
|
||||
printf("First 10 logits: ");
|
||||
for (int i = 0; i < 10 && i < n_logits; i++) {
|
||||
printf("%.6f ", logits[i]);
|
||||
}
|
||||
printf("\n");
|
||||
if (!embedding_mode) {
|
||||
printf("First 10 logits: ");
|
||||
for (int i = 0; i < 10 && i < n_logits; i++) {
|
||||
printf("%.6f ", logits[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("Last 10 logits: ");
|
||||
for (int i = n_logits - 10; i < n_logits; i++) {
|
||||
if (i >= 0) printf("%.6f ", logits[i]);
|
||||
printf("Last 10 logits: ");
|
||||
for (int i = n_logits - 10; i < n_logits; i++) {
|
||||
if (i >= 0) printf("%.6f ", logits[i]);
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
printf("\n\n");
|
||||
|
||||
printf("Logits saved to %s\n", bin_filename);
|
||||
printf("Logits saved to %s\n", txt_filename);
|
||||
|
||||
@@ -2,8 +2,37 @@
|
||||
|
||||
set -e
|
||||
|
||||
MODEL_PATH="${1:-"$EMBEDDING_MODEL_PATH"}"
|
||||
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
|
||||
# Parse command line arguments
|
||||
MODEL_PATH=""
|
||||
MODEL_NAME=""
|
||||
PROMPTS_FILE=""
|
||||
|
||||
# First argument is always model path
|
||||
if [ $# -gt 0 ] && [[ "$1" != --* ]]; then
|
||||
MODEL_PATH="$1"
|
||||
shift
|
||||
fi
|
||||
|
||||
# Parse remaining arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--prompts-file|-pf)
|
||||
PROMPTS_FILE="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
# If MODEL_NAME not set and this isn't a flag, use as model name
|
||||
if [ -z "$MODEL_NAME" ] && [[ "$1" != --* ]]; then
|
||||
MODEL_NAME="$1"
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Set defaults
|
||||
MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}"
|
||||
MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
|
||||
|
||||
if [ -t 0 ]; then
|
||||
CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
|
||||
@@ -35,8 +64,18 @@ with open('$TEMP_FILE', 'wb') as f:
|
||||
trap "rm -f $TEMP_FILE" EXIT
|
||||
fi
|
||||
|
||||
python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
|
||||
# Build the semantic_check.py command
|
||||
SEMANTIC_CMD="python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
|
||||
--python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
|
||||
--cpp-embeddings $CPP_EMBEDDINGS \
|
||||
--prompt "Hello world today"
|
||||
--cpp-embeddings $CPP_EMBEDDINGS"
|
||||
|
||||
# Add prompts file if specified, otherwise use default prompt
|
||||
if [ -n "$PROMPTS_FILE" ]; then
|
||||
SEMANTIC_CMD="$SEMANTIC_CMD --prompts-file \"$PROMPTS_FILE\""
|
||||
else
|
||||
SEMANTIC_CMD="$SEMANTIC_CMD --prompt \"Hello world today\""
|
||||
fi
|
||||
|
||||
# Execute the command
|
||||
eval $SEMANTIC_CMD
|
||||
|
||||
|
||||
@@ -2,8 +2,27 @@
|
||||
|
||||
set -e
|
||||
|
||||
# First try command line argument, then environment variable, then file
|
||||
CONVERTED_MODEL="${1:-"$CONVERTED_EMBEDDING_MODEL"}"
|
||||
# Parse command line arguments
|
||||
CONVERTED_MODEL=""
|
||||
PROMPTS_FILE=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
-p|--prompts-file)
|
||||
PROMPTS_FILE="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
CONVERTED_MODEL="$1"
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# First try command line argument, then environment variable
|
||||
CONVERTED_MODEL="${CONVERTED_MODEL:-"$CONVERTED_EMBEDDING_MODEL"}"
|
||||
|
||||
# Final check if we have a model path
|
||||
if [ -z "$CONVERTED_MODEL" ]; then
|
||||
@@ -13,8 +32,19 @@ if [ -z "$CONVERTED_MODEL" ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Read prompt from file or use default
|
||||
if [ -n "$PROMPTS_FILE" ]; then
|
||||
if [ ! -f "$PROMPTS_FILE" ]; then
|
||||
echo "Error: Prompts file '$PROMPTS_FILE' not found" >&2
|
||||
exit 1
|
||||
fi
|
||||
PROMPT=$(cat "$PROMPTS_FILE")
|
||||
else
|
||||
PROMPT="Hello world today"
|
||||
fi
|
||||
|
||||
echo $CONVERTED_MODEL
|
||||
|
||||
cmake --build ../../build --target llama-logits -j8
|
||||
|
||||
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "Hello world today"
|
||||
# TODO: update logits.cpp to accept a --file/-f option for the prompt
|
||||
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"
|
||||
|
||||
@@ -13,14 +13,37 @@ unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process model with specified path')
|
||||
parser.add_argument('--model-path', '-m', help='Path to the model')
|
||||
parser.add_argument('--prompts-file', '-p', help='Path to file containing prompts (one per line)')
|
||||
args = parser.parse_args()
|
||||
|
||||
def read_prompt_from_file(file_path):
    """Return the whole content of ``file_path`` as one stripped string.

    The file is read as UTF-8. On any read failure an error message is written
    to stderr and the process exits with status 1.
    """
    # Local import keeps this function self-contained; `sys.exit` replaces the
    # `site`-provided `exit()` helper, which is meant for interactive sessions.
    import sys

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except FileNotFoundError:
        # Diagnostics go to stderr so they do not mix with the script's regular output.
        print(f"Error: Prompts file '{file_path}' not found", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error reading prompts file: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
|
||||
if model_path is None:
|
||||
parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
|
||||
config = AutoConfig.from_pretrained(model_path)
|
||||
|
||||
# This can be used to override the sliding window size for manual testing. This
|
||||
# can be useful to verify the sliding window attention mask in the original model
|
||||
# and compare it with the converted .gguf model.
|
||||
if hasattr(config, 'sliding_window'):
|
||||
original_sliding_window = config.sliding_window
|
||||
#original_sliding_window = 6
|
||||
print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
|
||||
|
||||
print(f"Using unreleased model: {unreleased_model_name}")
|
||||
if unreleased_model_name:
|
||||
model_name_lower = unreleased_model_name.lower()
|
||||
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||
@@ -29,19 +52,28 @@ if unreleased_model_name:
|
||||
|
||||
try:
|
||||
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
|
||||
model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained
|
||||
model = model_class.from_pretrained(model_path, config=config)
|
||||
except (ImportError, AttributeError) as e:
|
||||
print(f"Failed to import or load model: {e}")
|
||||
exit(1)
|
||||
else:
|
||||
model = AutoModel.from_pretrained(model_path)
|
||||
model = AutoModel.from_pretrained(model_path, config=config)
|
||||
print(f"Model class: {type(model)}")
|
||||
#print(f"Model file: {type(model).__module__}")
|
||||
config = AutoConfig.from_pretrained(model_path)
|
||||
print(f"Model file: {type(model).__module__}")
|
||||
|
||||
# Verify the model is using the correct sliding window
|
||||
if hasattr(model.config, 'sliding_window'):
|
||||
print(f"Model's sliding_window: {model.config.sliding_window}")
|
||||
else:
|
||||
print("Model config does not have sliding_window attribute")
|
||||
|
||||
model_name = os.path.basename(model_path)
|
||||
|
||||
texts = [ "Hello world today" ]
|
||||
if args.prompts_file:
|
||||
prompt_text = read_prompt_from_file(args.prompts_file)
|
||||
texts = [prompt_text]
|
||||
else:
|
||||
texts = ["Hello world today"]
|
||||
|
||||
encoded = tokenizer(
|
||||
texts,
|
||||
|
||||
@@ -40,7 +40,7 @@ if os.path.exists(index_path):
|
||||
file_path = os.path.join(model_path, file_name)
|
||||
print(f"\n--- From {file_name} ---")
|
||||
|
||||
with safe_open(file_path, framework="pt") as f: # type: ignore
|
||||
with safe_open(file_path, framework="pt") as f:
|
||||
for tensor_name in sorted(tensor_names):
|
||||
tensor = f.get_tensor(tensor_name)
|
||||
print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
|
||||
@@ -49,7 +49,7 @@ elif os.path.exists(single_file_path):
|
||||
# Single file model (original behavior)
|
||||
print("Single-file model detected")
|
||||
|
||||
with safe_open(single_file_path, framework="pt") as f: # type: ignore
|
||||
with safe_open(single_file_path, framework="pt") as f:
|
||||
keys = f.keys()
|
||||
print("Tensors in model:")
|
||||
for key in sorted(keys):
|
||||
|
||||
@@ -101,6 +101,17 @@ def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
|
||||
'rms_diff': np.sqrt(np.mean(diff_matrix**2))
|
||||
}
|
||||
|
||||
def read_prompt_from_file(file_path):
    """Return the contents of the prompt file at ``file_path``.

    The file is decoded as UTF-8 and surrounding whitespace is stripped.
    If the file is missing or cannot be read, an error message is printed
    and the process terminates with exit status 1.
    """
    # Use sys.exit rather than the `exit` helper: the latter is injected by the
    # `site` module for interactive sessions and is not guaranteed to exist.
    import sys

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except FileNotFoundError:
        print(f"Error: Prompts file '{file_path}' not found")
        sys.exit(1)
    except Exception as e:
        print(f"Error reading prompts file: {e}")
        sys.exit(1)
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
|
||||
parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
|
||||
@@ -108,14 +119,20 @@ def main():
|
||||
parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file')
|
||||
parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
|
||||
parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
|
||||
parser.add_argument('--prompts-file', '-pf', help='Path to file containing prompts')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.prompts_file:
|
||||
prompt = read_prompt_from_file(args.prompts_file)
|
||||
else:
|
||||
prompt = args.prompt
|
||||
|
||||
print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
|
||||
print("=" * 70)
|
||||
|
||||
# Single prompt detailed comparison
|
||||
print(f"\nTesting with prompt: '{args.prompt}'")
|
||||
print(f"\nTesting with prompt: '{prompt}'")
|
||||
|
||||
# Load the python model to get configuration information and also to load the tokenizer.
|
||||
print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
|
||||
@@ -144,7 +161,7 @@ def main():
|
||||
else:
|
||||
model = AutoModel.from_pretrained(args.model_path)
|
||||
|
||||
encoded = tokenizer(args.prompt, return_tensors="pt")
|
||||
encoded = tokenizer(prompt, return_tensors="pt")
|
||||
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
|
||||
n_tokens = len(tokens)
|
||||
print(f"n_tokens: {n_tokens}");
|
||||
@@ -155,7 +172,7 @@ def main():
|
||||
python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)
|
||||
|
||||
# Run comparison
|
||||
results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, args.prompt)
|
||||
results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, prompt)
|
||||
|
||||
# Summary
|
||||
print(f"\n=== SUMMARY ===")
|
||||
|
||||
@@ -160,7 +160,6 @@
|
||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
|
||||
@@ -105,6 +105,18 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
||||
|
||||
return ((v4f32)res)[0];
|
||||
}
|
||||
|
||||
// multiply int8_t, add results pairwise twice
|
||||
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
|
||||
// Get absolute values of x vectors
|
||||
const __m128i ax = __lsx_vsigncov_b(x, x);
|
||||
// Sign the values of the y vectors
|
||||
const __m128i sy = __lsx_vsigncov_b(x, y);
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m128i dot = lsx_maddubs_h(ax, sy);
|
||||
const __m128i ones = __lsx_vreplgr2vr_h(1);
|
||||
return lsx_madd_h(ones, dot);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch_asx)
|
||||
@@ -323,18 +335,6 @@ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
|
||||
}
|
||||
}
|
||||
|
||||
// multiply int8_t, add results pairwise twice
|
||||
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
|
||||
// Get absolute values of x vectors
|
||||
const __m128i ax = __lsx_vsigncov_b(x, x);
|
||||
// Sign the values of the y vectors
|
||||
const __m128i sy = __lsx_vsigncov_b(x, y);
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m128i dot = lsx_maddubs_h(ax, sy);
|
||||
const __m128i ones = __lsx_vreplgr2vr_h(1);
|
||||
return lsx_madd_h(ones, dot);
|
||||
}
|
||||
|
||||
// horizontally add 8 floats
|
||||
static inline float hsum_float_8(const __m256 x) {
|
||||
__m128 res = lasx_extractf128(x, 1);
|
||||
|
||||
@@ -260,6 +260,101 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
#endif
|
||||
}
|
||||
|
||||
// Dot product of an MXFP4-quantized row (vx) with a Q8_0-quantized row (vy).
// The scalar result is written to *s. Only nrc == 1 is supported and n must be
// a multiple of QK_MXFP4. Without the s390x vector facility the call is
// forwarded to the generic implementation.
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    assert(n % QK_MXFP4 == 0);
    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");

    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

#if !defined(__VXE__) && !defined(__VXE2__)
    ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#else
    const int n_blocks = n / QK_MXFP4;

    const block_mxfp4 * GGML_RESTRICT x_blocks = vx;
    const block_q8_0  * GGML_RESTRICT y_blocks = vy;

    // lookup table of MXFP4 code values and the mask selecting the low nibble
    const int8x16_t  lut         = vec_xl(0, kvalues_mxfp4);
    const uint8x16_t nibble_mask = vec_splats((const uint8_t)0x0F);

    float32x4_t acc = vec_splats(0.0f);

    int ib = 0;

    // main loop: two blocks per iteration
    #pragma GCC unroll 8
    for (; ib + 1 < n_blocks; ib += 2) {
        const block_mxfp4 * GGML_RESTRICT px0 = &x_blocks[ib + 0];
        const block_mxfp4 * GGML_RESTRICT px1 = &x_blocks[ib + 1];
        const block_q8_0  * GGML_RESTRICT py0 = &y_blocks[ib + 0];
        const block_q8_0  * GGML_RESTRICT py1 = &y_blocks[ib + 1];

        const uint8x16_t packed0 = vec_xl(0, px0->qs);
        const uint8x16_t packed1 = vec_xl(0, px1->qs);

        // split every packed byte into its low / high 4-bit index
        const int8x16_t idx0_lo = (int8x16_t)vec_and(packed0, nibble_mask);
        const int8x16_t idx0_hi = (int8x16_t)vec_sr(packed0, 4);
        const int8x16_t idx1_lo = (int8x16_t)vec_and(packed1, nibble_mask);
        const int8x16_t idx1_hi = (int8x16_t)vec_sr(packed1, 4);

        // translate the indices into table values
        const int8x16_t w0_lo = vec_perm(lut, lut, (uchar8x16_t)idx0_lo);
        const int8x16_t w0_hi = vec_perm(lut, lut, (uchar8x16_t)idx0_hi);
        const int8x16_t w1_lo = vec_perm(lut, lut, (uchar8x16_t)idx1_lo);
        const int8x16_t w1_hi = vec_perm(lut, lut, (uchar8x16_t)idx1_hi);

        // low nibbles pair with the first half of y->qs, high nibbles with the second half
        const int8x16_t q0_lo = vec_xl(0,       py0->qs);
        const int8x16_t q0_hi = vec_xl(QK8_0/2, py0->qs);
        const int8x16_t q1_lo = vec_xl(0,       py1->qs);
        const int8x16_t q1_hi = vec_xl(QK8_0/2, py1->qs);

        const int32x4_t dot0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), w0_lo, q0_lo), w0_hi, q0_hi);
        const int32x4_t dot1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), w1_lo, q1_lo), w1_hi, q1_hi);

        const float32x4_t dot0_f = vec_float(dot0);
        const float32x4_t dot1_f = vec_float(dot1);

        // combined per-block scale: x block scale times y block scale
        const float32x4_t scale0 = vec_splats(GGML_E8M0_TO_FP32_HALF(px0->e) * GGML_CPU_FP16_TO_FP32(py0->d));
        const float32x4_t scale1 = vec_splats(GGML_E8M0_TO_FP32_HALF(px1->e) * GGML_CPU_FP16_TO_FP32(py1->d));

        acc = vec_madd(dot0_f, scale0, acc);
        acc = vec_madd(dot1_f, scale1, acc);
    }

    // tail: at most one remaining block
    for (; ib < n_blocks; ++ib) {
        const block_mxfp4 * GGML_RESTRICT px = &x_blocks[ib + 0];
        const block_q8_0  * GGML_RESTRICT py = &y_blocks[ib + 0];

        const uint8x16_t packed = vec_xl(0, px->qs);

        const int8x16_t idx_lo = (int8x16_t)vec_and(packed, nibble_mask);
        const int8x16_t idx_hi = (int8x16_t)vec_sr(packed, 4);

        const int8x16_t w_lo = vec_perm(lut, lut, (uchar8x16_t)idx_lo);
        const int8x16_t w_hi = vec_perm(lut, lut, (uchar8x16_t)idx_hi);

        const int8x16_t q_lo = vec_xl(0,       py->qs);
        const int8x16_t q_hi = vec_xl(QK8_0/2, py->qs);

        const int32x4_t   dot   = ggml_vec_dot(ggml_vec_dot(vec_splats(0), w_lo, q_lo), w_hi, q_hi);
        const float32x4_t dot_f = vec_float(dot);

        const float32x4_t scale = vec_splats(GGML_E8M0_TO_FP32_HALF(px->e) * GGML_CPU_FP16_TO_FP32(py->d));
        acc = vec_madd(dot_f, scale, acc);
    }

    // horizontal sum of the four partial accumulators
    *s = vec_hsum_f32x4(acc);
#endif
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
@@ -998,9 +998,9 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
|
||||
#define GGML_F32_EPR 4
|
||||
|
||||
#define GGML_F32x4 __m128
|
||||
#define GGML_F32x4_ZERO __lsx_vldi(0)
|
||||
#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
|
||||
#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
|
||||
#define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
|
||||
#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
|
||||
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
|
||||
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
|
||||
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
|
||||
#define GGML_F32x4_ADD __lsx_vfadd_s
|
||||
@@ -1022,7 +1022,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
|
||||
__m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
|
||||
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
|
||||
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
|
||||
const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
|
||||
const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
|
||||
tmp = __lsx_vsrli_d((__m128i) t0, 32); \
|
||||
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
|
||||
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
|
||||
@@ -1052,7 +1052,7 @@ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
|
||||
tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
|
||||
tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
|
||||
|
||||
return __lsx_vld(tmp, 0);
|
||||
return (__m128)__lsx_vld(tmp, 0);
|
||||
}
|
||||
|
||||
static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
||||
@@ -1067,9 +1067,9 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
||||
}
|
||||
|
||||
#define GGML_F32Cx4 __m128
|
||||
#define GGML_F32Cx4_ZERO __lsx_vldi(0)
|
||||
#define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
|
||||
#define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
|
||||
#define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
|
||||
#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
|
||||
#define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
|
||||
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
|
||||
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
|
||||
#define GGML_F32Cx4_ADD __lsx_vfadd_s
|
||||
|
||||
@@ -54,7 +54,7 @@ static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
|
||||
const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);
|
||||
|
||||
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
|
||||
if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -45,6 +45,7 @@
|
||||
#include "ggml-cuda/sumrows.cuh"
|
||||
#include "ggml-cuda/mean.cuh"
|
||||
#include "ggml-cuda/tsembd.cuh"
|
||||
#include "ggml-cuda/topk-moe.cuh"
|
||||
#include "ggml-cuda/unary.cuh"
|
||||
#include "ggml-cuda/upscale.cuh"
|
||||
#include "ggml-cuda/wkv.cuh"
|
||||
@@ -2825,6 +2826,44 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
GGML_ASSERT(unary_ops.size() == num_unary);
|
||||
#endif
|
||||
|
||||
//TODO: remove special case once ggml_can_fuse can handle empty nodes
|
||||
std::initializer_list<enum ggml_op> topk_moe_ops = ggml_cuda_topk_moe_ops(false);
|
||||
std::initializer_list<enum ggml_op> topk_moe_ops_with_norm = ggml_cuda_topk_moe_ops(true);
|
||||
|
||||
if (ops.size() == topk_moe_ops_with_norm.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops_with_norm.begin())) {
|
||||
|
||||
if (node_idx + topk_moe_ops_with_norm.size() > (size_t)cgraph->n_nodes) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < topk_moe_ops_with_norm.size(); i++) {
|
||||
if (cgraph->nodes[node_idx + i]->op != topk_moe_ops_with_norm.begin()[i]) return false;
|
||||
}
|
||||
ggml_tensor * softmax = cgraph->nodes[node_idx];
|
||||
ggml_tensor * weights = cgraph->nodes[node_idx+8];
|
||||
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (ops.size() == topk_moe_ops.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops.begin())) {
|
||||
|
||||
if (node_idx + topk_moe_ops.size() > (size_t)cgraph->n_nodes) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < topk_moe_ops.size(); i++) {
|
||||
if (cgraph->nodes[node_idx + i]->op != topk_moe_ops.begin()[i]) return false;
|
||||
}
|
||||
|
||||
ggml_tensor * softmax = cgraph->nodes[node_idx];
|
||||
ggml_tensor * weights = cgraph->nodes[node_idx+4];
|
||||
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
|
||||
return false;
|
||||
}
|
||||
@@ -2915,6 +2954,22 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
||||
if (!disable_fusion) {
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
|
||||
ggml_tensor * weights = cgraph->nodes[i+8];
|
||||
ggml_tensor * selected_experts = cgraph->nodes[i+3];
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ true);
|
||||
i += 8;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
|
||||
ggml_tensor * weights = cgraph->nodes[i+4];
|
||||
ggml_tensor * selected_experts = cgraph->nodes[i+3];
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ false);
|
||||
i += 4;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (node->op == GGML_OP_ADD) {
|
||||
int n_fuse = 0;
|
||||
ggml_op ops[8];
|
||||
|
||||
@@ -81,7 +81,7 @@ static __global__ void mmq_ids_helper(
|
||||
#pragma unroll
|
||||
for (int offset = neu_padded; offset < warp_size; offset += neu_padded) {
|
||||
const int tmp = __shfl_up_sync(0xFFFFFFFF, it_compact_add_self, offset, warp_size);
|
||||
if (threadIdx.x >= offset) {
|
||||
if (threadIdx.x >= static_cast<unsigned int>(offset)) {
|
||||
it_compact_add_lower += tmp;
|
||||
}
|
||||
}
|
||||
@@ -110,7 +110,7 @@ static __global__ void mmq_ids_helper(
|
||||
|
||||
expert_bounds[expert] = nex_prev;
|
||||
|
||||
if (expert < gridDim.x - 1) {
|
||||
if (expert < static_cast<int>(gridDim.x) - 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -220,7 +220,7 @@ static __global__ void mul_mat_vec_q(
|
||||
tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
|
||||
}
|
||||
|
||||
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + int(threadIdx.x) < stride_col_dst)) {
|
||||
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
|
||||
dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,6 +51,8 @@ static __global__ __launch_bounds__(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1) void
|
||||
}
|
||||
const float value = *(const float *) (src0_ptr + src_idx * nb00);
|
||||
*(float *) (dst_ptr + i0 * nb0) = value;
|
||||
|
||||
GGML_UNUSED(p1);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
@@ -0,0 +1,259 @@
|
||||
#include "ggml-cuda/common.cuh"
|
||||
#include "ggml.h"
|
||||
#include "topk-moe.cuh"
|
||||
|
||||
#include <initializer_list>
|
||||
|
||||
/*
|
||||
This kernel does the following:
|
||||
1. softmax over the logits per token [n_experts, n_tokens]
|
||||
2. argmax reduce over the top-k (n_experts_used) logits
|
||||
3. write weights + ids to global memory
|
||||
4. optionally normalize the weights
|
||||
|
||||
It is intended as fusion of softmax->top-k->get_rows pipeline for MoE models
|
||||
*/
|
||||
template <size_t n_experts, bool with_norm>
|
||||
__launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
|
||||
float * weights,
|
||||
int32_t * ids,
|
||||
const int n_rows,
|
||||
const int n_expert_used) {
|
||||
const int row = blockIdx.x * blockDim.y + threadIdx.y;
|
||||
if (row >= n_rows) {
|
||||
return;
|
||||
}
|
||||
|
||||
logits += n_experts * row;
|
||||
weights += n_expert_used * row;
|
||||
ids += n_experts * row;
|
||||
|
||||
constexpr int experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1;
|
||||
|
||||
float logits_r[experts_per_thread];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < n_experts; i += WARP_SIZE) {
|
||||
const int expert = i + threadIdx.x;
|
||||
logits_r[i / WARP_SIZE] = n_experts % WARP_SIZE == 0 || expert < n_experts ? logits[expert] : -INFINITY;
|
||||
}
|
||||
|
||||
float max_val = logits_r[0];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i < experts_per_thread; i++) {
|
||||
const float val = logits_r[i];
|
||||
max_val = max(val, max_val);
|
||||
}
|
||||
|
||||
max_val = warp_reduce_max(max_val);
|
||||
|
||||
float wt[experts_per_thread];
|
||||
float tmp = 0.f;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < experts_per_thread; i++) {
|
||||
const float val = logits_r[i];
|
||||
wt[i] = expf(val - max_val);
|
||||
tmp += wt[i];
|
||||
}
|
||||
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
|
||||
const float inv_sum = 1.0f / tmp;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < experts_per_thread; i++) {
|
||||
wt[i] = wt[i] * inv_sum;
|
||||
}
|
||||
|
||||
//at this point, each thread holds a portion of softmax,
|
||||
//we do the argmax reduce over n_expert_used, each time marking
|
||||
//the expert weight as -inf to exclude from the next iteration
|
||||
|
||||
float wt_sum = 0.f;
|
||||
|
||||
extern __shared__ float data_topk_shared[];
|
||||
float * wt_shared_ptr = data_topk_shared + threadIdx.y * n_expert_used;
|
||||
|
||||
for (int k = 0; k < n_expert_used; k++) {
|
||||
float max_val = wt[0];
|
||||
int max_expert = threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i < experts_per_thread; i++) {
|
||||
const int expert = threadIdx.x + i * WARP_SIZE;
|
||||
if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
|
||||
max_val = wt[i];
|
||||
max_expert = expert;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
|
||||
const float val = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, WARP_SIZE);
|
||||
const int expert = __shfl_xor_sync(0xFFFFFFFF, max_expert, mask, WARP_SIZE);
|
||||
if (val > max_val || (val == max_val && expert < max_expert)) {
|
||||
max_val = val;
|
||||
max_expert = expert;
|
||||
}
|
||||
}
|
||||
|
||||
if ((max_expert & (WARP_SIZE - 1)) == threadIdx.x) {
|
||||
wt[max_expert / WARP_SIZE] = -INFINITY;
|
||||
|
||||
wt_shared_ptr[k] = max_val;
|
||||
ids[k] = max_expert;
|
||||
if constexpr (with_norm) {
|
||||
wt_sum += max_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (with_norm) {
|
||||
wt_sum = warp_reduce_sum(wt_sum);
|
||||
const float inv_sum = 1.0f / wt_sum;
|
||||
|
||||
for (int i = threadIdx.x; i < n_expert_used; i += WARP_SIZE) {
|
||||
wt_shared_ptr[i] = wt_shared_ptr[i] * inv_sum;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = threadIdx.x; i < n_expert_used; i += WARP_SIZE) {
|
||||
weights[i] = wt_shared_ptr[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <bool with_norm>
|
||||
static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
|
||||
const float * logits,
|
||||
float * weights,
|
||||
int32_t * ids,
|
||||
const int n_rows,
|
||||
const int n_expert,
|
||||
const int n_expert_used) {
|
||||
const int rows_per_block = 4;
|
||||
dim3 grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1);
|
||||
dim3 block_dims(WARP_SIZE, rows_per_block, 1);
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const int nbytes_shared = n_expert_used * rows_per_block * sizeof(float);
|
||||
|
||||
switch (n_expert) {
|
||||
case 1:
|
||||
topk_moe_cuda<1, with_norm>
|
||||
<<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
|
||||
break;
|
||||
case 2:
|
||||
topk_moe_cuda<2, with_norm>
|
||||
<<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
|
||||
break;
|
||||
case 4:
|
||||
topk_moe_cuda<4, with_norm>
|
||||
<<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
|
||||
break;
|
||||
case 8:
|
||||
topk_moe_cuda<8, with_norm>
|
||||
<<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
|
||||
break;
|
||||
case 16:
|
||||
topk_moe_cuda<16, with_norm>
|
||||
<<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
|
||||
break;
|
||||
case 32:
|
||||
topk_moe_cuda<32, with_norm>
|
||||
<<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
|
||||
break;
|
||||
case 64:
|
||||
topk_moe_cuda<64, with_norm>
|
||||
<<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
|
||||
break;
|
||||
case 128:
|
||||
topk_moe_cuda<128, with_norm>
|
||||
<<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
|
||||
break;
|
||||
case 256:
|
||||
topk_moe_cuda<256, with_norm>
|
||||
<<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
|
||||
break;
|
||||
case 512:
|
||||
topk_moe_cuda<512, with_norm>
|
||||
<<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Host entry point of the fused top-k MoE kernel.
//   logits    : node whose src[0] holds the F32 input logits; ne[0] is the expert count
//               and ne[1] the row count
//   weights   : F32 output for the selected weights; ne[1] is taken as the number of
//               experts used per row
//   ids       : I32 output for the selected expert indices; its row stride must equal
//               n_experts elements
//   with_norm : selects the kernel variant that also normalizes the selected weights
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
                           const ggml_tensor *         logits,
                           ggml_tensor *               weights,
                           ggml_tensor *               ids,
                           const bool                  with_norm) {
    GGML_ASSERT(logits->type == GGML_TYPE_F32);
    GGML_ASSERT(weights->type == GGML_TYPE_F32);
    GGML_ASSERT(ids->type == GGML_TYPE_I32);

    const int n_experts = logits->ne[0];
    const int n_rows    = logits->ne[1];

    const float * logits_d  = (const float *) logits->src[0]->data;
    float *       weights_d = (float *) weights->data;
    int32_t *     ids_d     = (int32_t *) ids->data;

    // the kernel advances the ids pointer by n_experts per row
    GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts);

    const int n_expert_used = weights->ne[1];

    // the stream is obtained from ctx inside the launcher
    if (with_norm) {
        launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
    } else {
        launch_topk_moe_cuda<false>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
    }
}
|
||||
|
||||
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights) {
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (const float *) softmax->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (const float *) softmax->op_params + 1, sizeof(float));
|
||||
|
||||
if (!ggml_is_contiguous(softmax->src[0]) || !ggml_is_contiguous(weights)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (scale != 1.0f || max_bias != 0.0f) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// don't fuse when masks or sinks are present
|
||||
if (softmax->src[1] || softmax->src[2]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int n_expert = softmax->ne[0];
|
||||
// n_expert must be a power of 2
|
||||
if ((n_expert & (n_expert - 1)) != 0 || n_expert > 512) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool norm) {
|
||||
static std::initializer_list<enum ggml_op> norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
|
||||
GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
|
||||
GGML_OP_SUM_ROWS, GGML_OP_DIV, GGML_OP_RESHAPE };
|
||||
|
||||
static std::initializer_list<enum ggml_op> no_norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
|
||||
GGML_OP_VIEW, GGML_OP_GET_ROWS };
|
||||
|
||||
if (norm) {
|
||||
return norm_ops;
|
||||
}
|
||||
return no_norm_ops;
|
||||
}
|
||||
@@ -0,0 +1,14 @@
|
||||
#include "common.cuh"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <initializer_list>
|
||||
|
||||
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * logits,
|
||||
ggml_tensor * weights,
|
||||
ggml_tensor * top_k,
|
||||
const bool with_norm);
|
||||
|
||||
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights);
|
||||
|
||||
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm);
|
||||
@@ -256,8 +256,6 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
|
||||
|
||||
// perform reorders only across these types of ops
|
||||
// can be expanded when needed
|
||||
// IMPORTANT: do not add ops such as GGML_OP_CPY or GGML_OP_SET_ROWS
|
||||
// the dependencies from such ops are not always represented in the graph
|
||||
const auto & h_safe = [](ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
@@ -273,6 +271,8 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
|
||||
case GGML_OP_GLU:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_GET_ROWS:
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_SET_ROWS:
|
||||
return true;
|
||||
default:
|
||||
return ggml_op_is_empty(op);
|
||||
@@ -383,6 +383,7 @@ void ggml_graph_optimize(ggml_cgraph * gf) {
|
||||
// fuse only ops that start with these operations
|
||||
// can be expanded when needed
|
||||
if (node.op() == GGML_OP_ADD ||
|
||||
node.op() == GGML_OP_NORM ||
|
||||
node.op() == GGML_OP_RMS_NORM) {
|
||||
ops[0] = node.op();
|
||||
|
||||
@@ -392,6 +393,7 @@ void ggml_graph_optimize(ggml_cgraph * gf) {
|
||||
// can be expanded when needed
|
||||
if (gf->nodes[f]->op != GGML_OP_ADD &&
|
||||
gf->nodes[f]->op != GGML_OP_MUL &&
|
||||
gf->nodes[f]->op != GGML_OP_NORM &&
|
||||
gf->nodes[f]->op != GGML_OP_RMS_NORM) {
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1090,36 +1090,6 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_bin(
|
||||
return res;
|
||||
}
|
||||
|
||||
// Returns the pipeline for an RMS norm op, optionally fused with the following
// mul (n_fuse == 2) or mul + add (n_fuse == 3). A pipeline already present in the
// library is returned as-is; otherwise it is compiled and its shared memory size
// is set to 32 floats. Aborts on any other n_fuse value.
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rms_norm(ggml_metal_library_t lib, const ggml_tensor * op, int32_t n_fuse) {
    assert(op->op == GGML_OP_RMS_NORM);

    GGML_ASSERT(op->src[0]->ne[0] % 4 == 0);
    GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));

    // kernel name depends only on how many ops are fused
    const char * kernel = nullptr;
    switch (n_fuse) {
        case 1:
            kernel = "kernel_rms_norm_f32";
            break;
        case 2:
            kernel = "kernel_rms_norm_mul_f32";
            break;
        case 3:
            kernel = "kernel_rms_norm_mul_add_f32";
            break;
        default:
            GGML_ABORT("fatal error");
    }

    char base[256];
    char name[256];

    snprintf(base, sizeof(base), "%s", kernel);
    snprintf(name, sizeof(name), "%s", base);

    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline(lib, name);
    if (!pipeline) {
        pipeline = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);

        ggml_metal_pipeline_set_smem(pipeline, 32*sizeof(float));
    }

    return pipeline;
}
|
||||
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_l2_norm(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||
assert(op->op == GGML_OP_L2_NORM);
|
||||
|
||||
@@ -1167,16 +1137,37 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_group_norm(ggml_metal_libr
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||
assert(op->op == GGML_OP_NORM);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm(ggml_metal_library_t lib, const ggml_tensor * op, int n_fuse) {
|
||||
assert(op->op == GGML_OP_NORM || op->op == GGML_OP_RMS_NORM);
|
||||
|
||||
GGML_ASSERT(op->src[0]->ne[0] % 4 == 0);
|
||||
GGML_ASSERT(ggml_is_contiguous_1(op->src[0]));
|
||||
GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
|
||||
|
||||
char base[256];
|
||||
char name[256];
|
||||
|
||||
snprintf(base, 256, "kernel_norm_f32");
|
||||
const char * suffix = "";
|
||||
if (op->ne[0] % 4 == 0) {
|
||||
suffix = "_4";
|
||||
}
|
||||
|
||||
switch (op->op) {
|
||||
case GGML_OP_NORM:
|
||||
switch (n_fuse) {
|
||||
case 1: snprintf(base, 256, "kernel_norm_f32%s", suffix); break;
|
||||
case 2: snprintf(base, 256, "kernel_norm_mul_f32%s", suffix); break;
|
||||
case 3: snprintf(base, 256, "kernel_norm_mul_add_f32%s", suffix); break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
} break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
switch (n_fuse) {
|
||||
case 1: snprintf(base, 256, "kernel_rms_norm_f32%s", suffix); break;
|
||||
case 2: snprintf(base, 256, "kernel_rms_norm_mul_f32%s", suffix); break;
|
||||
case 3: snprintf(base, 256, "kernel_rms_norm_mul_add_f32%s", suffix); break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
} break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
snprintf(name, 256, "%s", base);
|
||||
|
||||
ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
|
||||
@@ -1237,7 +1228,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col(ggml_metal_library_
|
||||
char base[256];
|
||||
char name[256];
|
||||
|
||||
snprintf(base, 256, "kernel_im2col_ext_%s", ggml_type_name(op->type));
|
||||
snprintf(base, 256, "kernel_im2col_%s", ggml_type_name(op->type));
|
||||
snprintf(name, 256, "%s", base);
|
||||
|
||||
ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
|
||||
|
||||
@@ -123,10 +123,9 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv_id (ggml_me
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argmax (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argsort (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_bin (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rms_norm (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_l2_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_group_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
|
||||
@@ -661,13 +661,13 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]);
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_L2_NORM:
|
||||
return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
|
||||
case GGML_OP_ARGMAX:
|
||||
return has_simdgroup_reduction;
|
||||
case GGML_OP_NORM:
|
||||
return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
|
||||
case GGML_OP_RMS_NORM:
|
||||
return has_simdgroup_reduction && (ggml_is_contiguous_rows(op->src[0]));
|
||||
case GGML_OP_ROPE:
|
||||
return true;
|
||||
case GGML_OP_IM2COL:
|
||||
|
||||
@@ -428,16 +428,11 @@ typedef struct {
|
||||
uint64_t nb1;
|
||||
} ggml_metal_kargs_mul_mv_id;
|
||||
|
||||
// NORM
|
||||
// RMS_NORM
|
||||
typedef struct {
|
||||
int32_t ne00;
|
||||
int32_t ne00_4;
|
||||
uint64_t nb01;
|
||||
float eps;
|
||||
} ggml_metal_kargs_norm;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne00;
|
||||
int32_t ne00_4;
|
||||
int32_t ne00_t;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
uint64_t nb3;
|
||||
@@ -448,7 +443,7 @@ typedef struct {
|
||||
uint64_t nbf1[3];
|
||||
uint64_t nbf2[3];
|
||||
uint64_t nbf3[3];
|
||||
} ggml_metal_kargs_rms_norm;
|
||||
} ggml_metal_kargs_norm;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne00;
|
||||
|
||||
@@ -266,10 +266,6 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
||||
{
|
||||
n_fuse = ggml_metal_op_set_rows(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
{
|
||||
n_fuse = ggml_metal_op_rms_norm(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_L2_NORM:
|
||||
{
|
||||
n_fuse = ggml_metal_op_l2_norm(ctx, idx);
|
||||
@@ -279,6 +275,7 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
||||
n_fuse = ggml_metal_op_group_norm(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
{
|
||||
n_fuse = ggml_metal_op_norm(ctx, idx);
|
||||
} break;
|
||||
@@ -2346,146 +2343,6 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
|
||||
return n_fuse;
|
||||
}
|
||||
|
||||
int ggml_metal_op_rms_norm(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_cgraph * gf = ctx->gf;
|
||||
ggml_tensor * op = ggml_graph_node(gf, idx);
|
||||
|
||||
ggml_metal_library_t lib = ctx->lib;
|
||||
ggml_metal_encoder_t enc = ctx->enc;
|
||||
|
||||
const int idx_end = ctx->idx_end;
|
||||
|
||||
const bool use_fusion = ctx->use_fusion;
|
||||
|
||||
const int debug_fusion = ctx->debug_fusion;
|
||||
|
||||
ggml_tensor ** ops = ggml_graph_nodes(gf) + idx;
|
||||
|
||||
GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
|
||||
GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
|
||||
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
|
||||
GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, op->op_params, sizeof(float));
|
||||
|
||||
ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
|
||||
ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
|
||||
|
||||
ggml_metal_kargs_rms_norm args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne00_4 =*/ ne00/4,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
/*.eps =*/ eps,
|
||||
/*.nef1 =*/ { ne01 },
|
||||
/*.nef2 =*/ { ne02 },
|
||||
/*.nef3 =*/ { ne03 },
|
||||
/*.nbf1 =*/ { nb01 },
|
||||
/*.nbf2 =*/ { nb02 },
|
||||
/*.nbf3 =*/ { nb03 },
|
||||
};
|
||||
|
||||
ggml_op fops[8];
|
||||
|
||||
int n_fuse = 1;
|
||||
|
||||
ggml_metal_buffer_id bid_fuse[2] = { bid_src0, bid_src0 };
|
||||
|
||||
// d[0] = rms_norm(a)
|
||||
// d[1] = mul(d[0], b)
|
||||
// d[2] = add(d[1], c)
|
||||
if (use_fusion) {
|
||||
fops[0] = GGML_OP_RMS_NORM;
|
||||
fops[1] = GGML_OP_MUL;
|
||||
fops[2] = GGML_OP_ADD;
|
||||
|
||||
for (n_fuse = 0; n_fuse <= 1 && idx + n_fuse + 1 < idx_end; ++n_fuse) {
|
||||
if (!ggml_can_fuse(gf, idx + n_fuse, fops + n_fuse, 2)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ops[n_fuse] != ops[n_fuse + 1]->src[0]) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ops[n_fuse + 1]->src[1]->ne[0] != op->ne[0]) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (!ggml_is_contiguous_rows(ops[n_fuse + 1]->src[1])) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ops[n_fuse + 1]->type != GGML_TYPE_F32) {
|
||||
break;
|
||||
}
|
||||
|
||||
//ctx->fuse_cnt[ops[n_fuse + 1]->op]++;
|
||||
|
||||
bid_fuse[n_fuse] = ggml_metal_get_buffer_id(ops[n_fuse + 1]->src[1]);
|
||||
|
||||
args.nef1[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[1];
|
||||
args.nef2[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[2];
|
||||
args.nef3[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[3];
|
||||
|
||||
args.nbf1[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[1];
|
||||
args.nbf2[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[2];
|
||||
args.nbf3[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[3];
|
||||
}
|
||||
|
||||
++n_fuse;
|
||||
|
||||
if (debug_fusion > 1 && n_fuse > 1) {
|
||||
if (n_fuse == 2) {
|
||||
GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL\n", __func__);
|
||||
}
|
||||
if (n_fuse == 3) {
|
||||
GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL + ADD\n", __func__);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (n_fuse > 1) {
|
||||
bid_dst = ggml_metal_get_buffer_id(ops[n_fuse - 1]);
|
||||
|
||||
for (int i = 1; i < n_fuse; ++i) {
|
||||
if (!ggml_metal_op_concurrency_check(ctx, ops[i])) {
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rms_norm(lib, op, n_fuse);
|
||||
|
||||
int nth = 32; // SIMD width
|
||||
|
||||
while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
|
||||
nth *= 2;
|
||||
}
|
||||
|
||||
nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
||||
nth = std::min(nth, ne00/4);
|
||||
|
||||
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_fuse[0], 2);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_fuse[1], 3);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_dst, 4);
|
||||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
|
||||
|
||||
return n_fuse;
|
||||
}
|
||||
|
||||
int ggml_metal_op_l2_norm(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_cgraph * gf = ctx->gf;
|
||||
ggml_tensor * op = ggml_graph_node(gf, idx);
|
||||
@@ -2594,6 +2451,14 @@ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_metal_library_t lib = ctx->lib;
|
||||
ggml_metal_encoder_t enc = ctx->enc;
|
||||
|
||||
const int idx_end = ctx->idx_end;
|
||||
|
||||
const bool use_fusion = ctx->use_fusion;
|
||||
|
||||
const int debug_fusion = ctx->debug_fusion;
|
||||
|
||||
ggml_tensor ** ops = ggml_graph_nodes(gf) + idx;
|
||||
|
||||
GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
|
||||
GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
|
||||
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
|
||||
@@ -2602,37 +2467,121 @@ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) {
|
||||
float eps;
|
||||
memcpy(&eps, op->op_params, sizeof(float));
|
||||
|
||||
ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
|
||||
ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
|
||||
|
||||
ggml_metal_kargs_norm args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne00_4 =*/ ne00/4,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.ne00_t =*/ ne00 % 4 == 0 ? ne00/4 : ne00,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
/*.eps =*/ eps,
|
||||
/*.nef1 =*/ { ne01 },
|
||||
/*.nef2 =*/ { ne02 },
|
||||
/*.nef3 =*/ { ne03 },
|
||||
/*.nbf1 =*/ { nb01 },
|
||||
/*.nbf2 =*/ { nb02 },
|
||||
/*.nbf3 =*/ { nb03 },
|
||||
};
|
||||
|
||||
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_norm(lib, op);
|
||||
ggml_op fops[8];
|
||||
|
||||
int n_fuse = 1;
|
||||
|
||||
ggml_metal_buffer_id bid_fuse[2] = { bid_src0, bid_src0 };
|
||||
|
||||
// d[0] = norm(a)
|
||||
// d[1] = mul(d[0], b)
|
||||
// d[2] = add(d[1], c)
|
||||
if (use_fusion) {
|
||||
fops[0] = op->op;
|
||||
fops[1] = GGML_OP_MUL;
|
||||
fops[2] = GGML_OP_ADD;
|
||||
|
||||
for (n_fuse = 0; n_fuse <= 1 && idx + n_fuse + 1 < idx_end; ++n_fuse) {
|
||||
if (!ggml_can_fuse(gf, idx + n_fuse, fops + n_fuse, 2)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ops[n_fuse] != ops[n_fuse + 1]->src[0]) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ops[n_fuse + 1]->src[1]->ne[0] != op->ne[0]) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (!ggml_is_contiguous_rows(ops[n_fuse + 1]->src[1])) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ops[n_fuse + 1]->type != GGML_TYPE_F32) {
|
||||
break;
|
||||
}
|
||||
|
||||
//ctx->fuse_cnt[ops[n_fuse + 1]->op]++;
|
||||
|
||||
bid_fuse[n_fuse] = ggml_metal_get_buffer_id(ops[n_fuse + 1]->src[1]);
|
||||
|
||||
args.nef1[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[1];
|
||||
args.nef2[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[2];
|
||||
args.nef3[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[3];
|
||||
|
||||
args.nbf1[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[1];
|
||||
args.nbf2[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[2];
|
||||
args.nbf3[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[3];
|
||||
}
|
||||
|
||||
++n_fuse;
|
||||
|
||||
if (debug_fusion > 1 && n_fuse > 1) {
|
||||
if (n_fuse == 2) {
|
||||
GGML_LOG_DEBUG("%s: fuse: %s + MUL\n", __func__, ggml_op_name(op->op));
|
||||
}
|
||||
if (n_fuse == 3) {
|
||||
GGML_LOG_DEBUG("%s: fuse: %s + MUL + ADD\n", __func__, ggml_op_name(op->op));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (n_fuse > 1) {
|
||||
bid_dst = ggml_metal_get_buffer_id(ops[n_fuse - 1]);
|
||||
|
||||
for (int i = 1; i < n_fuse; ++i) {
|
||||
if (!ggml_metal_op_concurrency_check(ctx, ops[i])) {
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);
|
||||
|
||||
int nth = 32; // SIMD width
|
||||
while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
|
||||
|
||||
while (nth < args.ne00_t && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
|
||||
nth *= 2;
|
||||
}
|
||||
|
||||
nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
||||
nth = std::min(nth, ne00/4);
|
||||
nth = std::min(nth, args.ne00_t);
|
||||
|
||||
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
|
||||
|
||||
const int64_t nrows = ggml_nrows(op->src[0]);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_fuse[0], 2);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_fuse[1], 3);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_dst, 4);
|
||||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, nth, 1, 1);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
|
||||
|
||||
return 1;
|
||||
return n_fuse;
|
||||
}
|
||||
|
||||
int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
|
||||
@@ -2768,7 +2717,6 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {
|
||||
const uint64_t ofs0 = op->src[1]->nb[is_2D ? 3 : 2] / 4;
|
||||
const uint64_t ofs1 = op->src[1]->nb[is_2D ? 2 : 1] / 4;
|
||||
|
||||
|
||||
ggml_metal_kargs_im2col args = {
|
||||
/*.ofs0 =*/ ofs0,
|
||||
/*.ofs1 =*/ ofs1,
|
||||
@@ -2789,15 +2737,16 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {
|
||||
|
||||
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);
|
||||
|
||||
const uint64_t n_threads = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), N);
|
||||
const int64_t quotient = N / n_threads + (N % n_threads > 0 ? 1 : 0);
|
||||
GGML_ASSERT(KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
||||
|
||||
const uint64_t ntptg0 = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, quotient * CHW, OH, OW, n_threads, 1, 1);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -60,7 +60,6 @@ int ggml_metal_op_mul_mat_id (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_add_id (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_flash_attn_ext (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_bin (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_rms_norm (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_l2_norm (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_group_norm (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_norm (ggml_metal_op_t ctx, int idx);
|
||||
|
||||
@@ -66,6 +66,10 @@ static inline float e8m0_to_fp32(uint8_t x) {
|
||||
return as_type<float>(bits);
|
||||
}
|
||||
|
||||
static inline float dot(float x, float y) {
|
||||
return x*y;
|
||||
}
|
||||
|
||||
// NOTE: this is not dequantizing - we are simply fitting the template
|
||||
template <typename type4x4>
|
||||
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
|
||||
@@ -2493,30 +2497,43 @@ kernel void kernel_argmax_f32(
|
||||
dst_i32[tgpig] = arg_val;
|
||||
}
|
||||
|
||||
kernel void kernel_norm_f32(
|
||||
// F == 1 : norm (no fuse)
|
||||
// F == 2 : norm + mul
|
||||
// F == 3 : norm + mul + add
|
||||
template <typename T, short F>
|
||||
kernel void kernel_norm_fuse_impl(
|
||||
constant ggml_metal_kargs_norm & args,
|
||||
device const char * src0,
|
||||
device const char * src1_0,
|
||||
device const char * src1_1,
|
||||
device char * dst,
|
||||
threadgroup float * shmem_f32 [[threadgroup(0)]],
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort ntg[[threads_per_threadgroup]]) {
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
if (sgitg == 0) {
|
||||
shmem_f32[tiisg] = 0.0f;
|
||||
}
|
||||
|
||||
device const float4 * x = (device const float4 *) (src0 + tgpig*args.nb01);
|
||||
const int i01 = tgpig.x;
|
||||
const int i02 = tgpig.y;
|
||||
const int i03 = tgpig.z;
|
||||
|
||||
float4 sumf4(0.0f);
|
||||
device const T * x = (device const T *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);
|
||||
|
||||
device const T * f0 = (device const T *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
|
||||
device const T * f1 = (device const T *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);
|
||||
|
||||
T sumft(0.0f);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
|
||||
sumf4 += x[i00];
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
sumft += x[i00];
|
||||
}
|
||||
sumf = sumf4[0] + sumf4[1] + sumf4[2] + sumf4[3];
|
||||
sumf = dot(sumft, T(1.0f));
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
@@ -2532,10 +2549,10 @@ kernel void kernel_norm_f32(
|
||||
|
||||
const float mean = sumf/args.ne00;
|
||||
|
||||
device float4 * y = (device float4 *) dst + tgpig*args.ne00_4;
|
||||
device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
|
||||
|
||||
sumf = 0.0f;
|
||||
for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
y[i00] = x[i00] - mean;
|
||||
sumf += dot(y[i00], y[i00]);
|
||||
}
|
||||
@@ -2555,17 +2572,35 @@ kernel void kernel_norm_f32(
|
||||
const float variance = sumf/args.ne00;
|
||||
|
||||
const float scale = 1.0f/sqrt(variance + args.eps);
|
||||
for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
|
||||
y[i00] = y[i00] * scale;
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
if (F == 1) {
|
||||
y[i00] = (y[i00]*scale);
|
||||
}
|
||||
if (F == 2) {
|
||||
y[i00] = (y[i00]*scale)*f0[i00];
|
||||
}
|
||||
if (F == 3) {
|
||||
y[i00] = (y[i00]*scale)*f0[i00] + f1[i00];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_norm_fuse_impl<float4, 1>) kernel_norm_fuse_t;
|
||||
|
||||
template [[host_name("kernel_norm_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float, 1>;
|
||||
template [[host_name("kernel_norm_mul_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float, 2>;
|
||||
template [[host_name("kernel_norm_mul_add_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float, 3>;
|
||||
|
||||
template [[host_name("kernel_norm_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float4, 1>;
|
||||
template [[host_name("kernel_norm_mul_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float4, 2>;
|
||||
template [[host_name("kernel_norm_mul_add_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float4, 3>;
|
||||
|
||||
// F == 1 : rms_norm (no fuse)
|
||||
// F == 2 : rms_norm + mul
|
||||
// F == 3 : rms_norm + mul + add
|
||||
template <short F>
|
||||
template <typename T, short F>
|
||||
kernel void kernel_rms_norm_fuse_impl(
|
||||
constant ggml_metal_kargs_rms_norm & args,
|
||||
constant ggml_metal_kargs_norm & args,
|
||||
device const char * src0,
|
||||
device const char * src1_0,
|
||||
device const char * src1_1,
|
||||
@@ -2584,15 +2619,15 @@ kernel void kernel_rms_norm_fuse_impl(
|
||||
const int i02 = tgpig.y;
|
||||
const int i03 = tgpig.z;
|
||||
|
||||
device const float4 * x = (device const float4 *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);
|
||||
device const T * x = (device const T *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);
|
||||
|
||||
device const float4 * f0 = (device const float4 *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
|
||||
device const float4 * f1 = (device const float4 *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);
|
||||
device const T * f0 = (device const T *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
|
||||
device const T * f1 = (device const T *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
// parallel sum
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_4; i00 += ntg.x) {
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
sumf += dot(x[i00], x[i00]);
|
||||
}
|
||||
sumf = simd_sum(sumf);
|
||||
@@ -2611,8 +2646,8 @@ kernel void kernel_rms_norm_fuse_impl(
|
||||
const float mean = sumf/args.ne00;
|
||||
const float scale = 1.0f/sqrt(mean + args.eps);
|
||||
|
||||
device float4 * y = (device float4 *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_4; i00 += ntg.x) {
|
||||
device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
if (F == 1) {
|
||||
y[i00] = (x[i00]*scale);
|
||||
}
|
||||
@@ -2625,11 +2660,15 @@ kernel void kernel_rms_norm_fuse_impl(
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_rms_norm_fuse_impl<1>) kernel_rms_norm_fuse_t;
|
||||
typedef decltype(kernel_rms_norm_fuse_impl<float4, 1>) kernel_rms_norm_fuse_t;
|
||||
|
||||
template [[host_name("kernel_rms_norm_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<1>;
|
||||
template [[host_name("kernel_rms_norm_mul_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<2>;
|
||||
template [[host_name("kernel_rms_norm_mul_add_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<3>;
|
||||
template [[host_name("kernel_rms_norm_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float, 1>;
|
||||
template [[host_name("kernel_rms_norm_mul_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float, 2>;
|
||||
template [[host_name("kernel_rms_norm_mul_add_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float, 3>;
|
||||
|
||||
template [[host_name("kernel_rms_norm_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 1>;
|
||||
template [[host_name("kernel_rms_norm_mul_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 2>;
|
||||
template [[host_name("kernel_rms_norm_mul_add_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 3>;
|
||||
|
||||
kernel void kernel_l2_norm_f32(
|
||||
constant ggml_metal_kargs_l2_norm & args,
|
||||
@@ -3987,60 +4026,7 @@ template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kerne
|
||||
template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
|
||||
template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
|
||||
|
||||
// TODO: obolete -- remove
|
||||
//typedef void (im2col_t)(
|
||||
// constant ggml_metal_kargs_im2col & args,
|
||||
// device const float * x,
|
||||
// device char * dst,
|
||||
// uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
// uint3 tgpg[[threadgroups_per_grid]],
|
||||
// uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
// uint3 ntg[[threads_per_threadgroup]]);
|
||||
//
|
||||
//template <typename T>
|
||||
//kernel void kernel_im2col(
|
||||
// constant ggml_metal_kargs_im2col & args,
|
||||
// device const float * x,
|
||||
// device char * dst,
|
||||
// uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
// uint3 tgpg[[threadgroups_per_grid]],
|
||||
// uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
// uint3 ntg[[threads_per_threadgroup]]) {
|
||||
//// const int64_t IC = tgpg[0];
|
||||
// const int64_t OH = tgpg[1];
|
||||
// const int64_t OW = tgpg[2];
|
||||
//
|
||||
//// const int64_t N = ntg[0];
|
||||
// const int64_t KH = ntg[1];
|
||||
// const int64_t KW = ntg[2];
|
||||
//
|
||||
// const int64_t in = tpitg[0];
|
||||
// const int64_t ikh = tpitg[1];
|
||||
// const int64_t ikw = tpitg[2];
|
||||
//
|
||||
// const int64_t iic = tgpig[0];
|
||||
// const int64_t ioh = tgpig[1];
|
||||
// const int64_t iow = tgpig[2];
|
||||
//
|
||||
// const int64_t iiw = iow*args.s0 + ikw*args.d0 - args.p0;
|
||||
// const int64_t iih = ioh*args.s1 + ikh*args.d1 - args.p1;
|
||||
//
|
||||
// const int64_t offset_dst = (in*OH*OW + ioh*OW + iow)*args.CHW + (iic*(KH*KW) + ikh*KW + ikw);
|
||||
//
|
||||
// device T * pdst = (device T *) (dst);
|
||||
//
|
||||
// if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
|
||||
// pdst[offset_dst] = 0.0f;
|
||||
// } else {
|
||||
// const int64_t offset_src = in*args.ofs0 + iic*args.ofs1 + iih*args.IW + iiw;
|
||||
// pdst[offset_dst] = x[offset_src];
|
||||
// }
|
||||
//}
|
||||
//
|
||||
//template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
|
||||
//template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
|
||||
|
||||
typedef void (im2col_ext_t)(
|
||||
typedef void (im2col_t)(
|
||||
constant ggml_metal_kargs_im2col & args,
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
@@ -4050,48 +4036,113 @@ typedef void (im2col_ext_t)(
|
||||
uint3 ntg[[threads_per_threadgroup]]);
|
||||
|
||||
template <typename T>
|
||||
kernel void kernel_im2col_ext(
|
||||
kernel void kernel_im2col(
|
||||
constant ggml_metal_kargs_im2col & args,
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]], // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
|
||||
uint3 tgpg[[threadgroups_per_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) { // [M, 1, 1]
|
||||
const int64_t KHW = (int64_t)args.KHW;
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
// const int64_t IC = tgpg[0];
|
||||
const int64_t OH = tgpg[1];
|
||||
const int64_t OW = tgpg[2];
|
||||
|
||||
const int64_t d = tgpig[0] / args.CHW;
|
||||
const int64_t chw = tgpig[0] % args.CHW;
|
||||
const int64_t tgpig_0 = chw / KHW; // 0 ~ (IC - 1)
|
||||
const int64_t HW = tgpig[0] % KHW;
|
||||
const int64_t KH = ntg[1];
|
||||
const int64_t KW = ntg[2];
|
||||
|
||||
const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
|
||||
if (tpitg_0 >= args.N) {
|
||||
return;
|
||||
}
|
||||
int64_t in = tpitg[0];
|
||||
const int64_t ikh = tpitg[1];
|
||||
const int64_t ikw = tpitg[2];
|
||||
|
||||
const int64_t tpitg_1 = HW / args.KW;
|
||||
const int64_t tpitg_2 = HW % args.KW;
|
||||
const int64_t iic = tgpig[0];
|
||||
const int64_t ioh = tgpig[1];
|
||||
const int64_t iow = tgpig[2];
|
||||
|
||||
const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0;
|
||||
const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1;
|
||||
const int64_t iiw = iow*args.s0 + ikw*args.d0 - args.p0;
|
||||
const int64_t iih = ioh*args.s1 + ikh*args.d1 - args.p1;
|
||||
|
||||
const int64_t offset_dst =
|
||||
(tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW +
|
||||
(tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2);
|
||||
int64_t offset_dst = (in*OH*OW + ioh*OW + iow)*args.CHW + (iic*(KH*KW) + ikh*KW + ikw);
|
||||
|
||||
device T * pdst = (device T *) (dst);
|
||||
|
||||
if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
|
||||
pdst[offset_dst] = 0.0f;
|
||||
while (in < args.N) {
|
||||
pdst[offset_dst] = 0.0f;
|
||||
offset_dst += ntg[0]*args.CHW*OH*OW;
|
||||
|
||||
in += ntg[0];
|
||||
}
|
||||
} else {
|
||||
const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1;
|
||||
pdst[offset_dst] = x[offset_src + iih * args.IW + iiw];
|
||||
int64_t offset_src = in*args.ofs0 + iic*args.ofs1 + iih*args.IW + iiw;
|
||||
|
||||
while (in < args.N) {
|
||||
pdst[offset_dst] = x[offset_src];
|
||||
|
||||
offset_dst += ntg[0]*args.CHW*OH*OW;
|
||||
offset_src += ntg[0]*args.ofs0;
|
||||
|
||||
in += ntg[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
|
||||
template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
|
||||
template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
|
||||
template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
|
||||
|
||||
// TODO: obolete -- remove
|
||||
//typedef void (im2col_ext_t)(
|
||||
// constant ggml_metal_kargs_im2col & args,
|
||||
// device const float * x,
|
||||
// device char * dst,
|
||||
// uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
// uint3 tgpg[[threadgroups_per_grid]],
|
||||
// uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
// uint3 ntg[[threads_per_threadgroup]]);
|
||||
//
|
||||
//template <typename T>
|
||||
//kernel void kernel_im2col_ext(
|
||||
// constant ggml_metal_kargs_im2col & args,
|
||||
// device const float * x,
|
||||
// device char * dst,
|
||||
// uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
// uint3 tgpg[[threadgroups_per_grid]], // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
|
||||
// uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
// uint3 ntg[[threads_per_threadgroup]]) { // [M, 1, 1]
|
||||
// const int64_t KHW = (int64_t)args.KHW;
|
||||
//
|
||||
// const int64_t d = tgpig[0] / args.CHW;
|
||||
// const int64_t chw = tgpig[0] % args.CHW;
|
||||
// const int64_t tgpig_0 = chw / KHW; // 0 ~ (IC - 1)
|
||||
// const int64_t HW = tgpig[0] % KHW;
|
||||
//
|
||||
// const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
|
||||
// if (tpitg_0 >= args.N) {
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// const int64_t tpitg_1 = HW / args.KW;
|
||||
// const int64_t tpitg_2 = HW % args.KW;
|
||||
//
|
||||
// const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0;
|
||||
// const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1;
|
||||
//
|
||||
// const int64_t offset_dst =
|
||||
// (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW +
|
||||
// (tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2);
|
||||
//
|
||||
// device T * pdst = (device T *) (dst);
|
||||
//
|
||||
// if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
|
||||
// pdst[offset_dst] = 0.0f;
|
||||
// } else {
|
||||
// const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1;
|
||||
// pdst[offset_dst] = x[offset_src + iih * args.IW + iiw];
|
||||
// }
|
||||
//}
|
||||
//
|
||||
//template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
|
||||
//template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
|
||||
|
||||
typedef void (conv_transpose_1d_t)(
|
||||
constant ggml_metal_kargs_conv_transpose_1d & args,
|
||||
|
||||
@@ -31,6 +31,12 @@
|
||||
#include <filesystem>
|
||||
#include <algorithm>
|
||||
|
||||
static const char * RPC_DEBUG = std::getenv("GGML_RPC_DEBUG");
|
||||
|
||||
#define LOG_DBG(...) \
|
||||
do { if (RPC_DEBUG) GGML_LOG_DEBUG(__VA_ARGS__); } while (0)
|
||||
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
static constexpr size_t MAX_CHUNK_SIZE = 1024ull * 1024ull * 1024ull; // 1 GiB
|
||||
@@ -47,7 +53,7 @@ struct socket_t {
|
||||
sockfd_t fd;
|
||||
socket_t(sockfd_t fd) : fd(fd) {}
|
||||
~socket_t() {
|
||||
GGML_PRINT_DEBUG("[%s] closing socket %d\n", __func__, this->fd);
|
||||
LOG_DBG("[%s] closing socket %d\n", __func__, this->fd);
|
||||
#ifdef _WIN32
|
||||
closesocket(this->fd);
|
||||
#else
|
||||
@@ -265,14 +271,14 @@ static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
|
||||
return nullptr;
|
||||
}
|
||||
if (!set_no_delay(sockfd)) {
|
||||
fprintf(stderr, "Failed to set TCP_NODELAY\n");
|
||||
GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
|
||||
return nullptr;
|
||||
}
|
||||
addr.sin_family = AF_INET;
|
||||
addr.sin_port = htons(port);
|
||||
struct hostent * server = gethostbyname(host);
|
||||
if (server == NULL) {
|
||||
fprintf(stderr, "Cannot resolve host '%s'\n", host);
|
||||
GGML_LOG_ERROR("Cannot resolve host '%s'\n", host);
|
||||
return nullptr;
|
||||
}
|
||||
memcpy(&addr.sin_addr.s_addr, server->h_addr, server->h_length);
|
||||
@@ -289,7 +295,7 @@ static std::shared_ptr<socket_t> socket_accept(sockfd_t srv_sockfd) {
|
||||
return nullptr;
|
||||
}
|
||||
if (!set_no_delay(client_socket_fd)) {
|
||||
fprintf(stderr, "Failed to set TCP_NODELAY\n");
|
||||
GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
|
||||
return nullptr;
|
||||
}
|
||||
return client_socket;
|
||||
@@ -302,11 +308,11 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
|
||||
return nullptr;
|
||||
}
|
||||
if (!set_reuse_addr(sockfd)) {
|
||||
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
|
||||
GGML_LOG_ERROR("Failed to set SO_REUSEADDR\n");
|
||||
return nullptr;
|
||||
}
|
||||
if (inet_addr(host) == INADDR_NONE) {
|
||||
fprintf(stderr, "Invalid host address: %s\n", host);
|
||||
GGML_LOG_ERROR("Invalid host address: %s\n", host);
|
||||
return nullptr;
|
||||
}
|
||||
struct sockaddr_in serv_addr;
|
||||
@@ -349,7 +355,7 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
|
||||
return false;
|
||||
}
|
||||
if (n == 0) {
|
||||
GGML_LOG_ERROR("recv returned 0 (peer closed?)\n");
|
||||
LOG_DBG("recv returned 0 (peer closed?)\n");
|
||||
return false;
|
||||
}
|
||||
bytes_recv += (size_t)n;
|
||||
@@ -383,7 +389,7 @@ static bool recv_msg(sockfd_t sockfd, std::vector<uint8_t> & input) {
|
||||
try {
|
||||
input.resize(size);
|
||||
} catch (const std::bad_alloc & e) {
|
||||
fprintf(stderr, "Failed to allocate input buffer of size %" PRIu64 "\n", size);
|
||||
GGML_LOG_ERROR("Failed to allocate input buffer of size %" PRIu64 "\n", size);
|
||||
return false;
|
||||
}
|
||||
return recv_data(sockfd, input.data(), size);
|
||||
@@ -443,11 +449,11 @@ static bool check_server_version(const std::shared_ptr<socket_t> & sock) {
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, nullptr, 0, &response, sizeof(response));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
if (response.major != RPC_PROTO_MAJOR_VERSION || response.minor > RPC_PROTO_MINOR_VERSION) {
|
||||
fprintf(stderr, "RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
GGML_LOG_ERROR("RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
return false;
|
||||
}
|
||||
if (response.minor != RPC_PROTO_MINOR_VERSION || response.patch != RPC_PROTO_PATCH_VERSION) {
|
||||
fprintf(stderr, "WARNING: RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
GGML_LOG_INFO("WARNING: RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -488,7 +494,7 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
||||
if (!check_server_version(sock)) {
|
||||
return nullptr;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
|
||||
LOG_DBG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
|
||||
sockets[endpoint] = sock;
|
||||
return sock;
|
||||
}
|
||||
@@ -809,7 +815,7 @@ ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
||||
}
|
||||
auto sock = get_socket(endpoint);
|
||||
if (sock == nullptr) {
|
||||
fprintf(stderr, "Failed to connect to %s\n", endpoint);
|
||||
GGML_LOG_ERROR("Failed to connect to %s\n", endpoint);
|
||||
return nullptr;
|
||||
}
|
||||
size_t alignment = get_alignment(sock);
|
||||
@@ -909,7 +915,7 @@ void rpc_server::hello(rpc_msg_hello_rsp & response) {
|
||||
response.major = RPC_PROTO_MAJOR_VERSION;
|
||||
response.minor = RPC_PROTO_MINOR_VERSION;
|
||||
response.patch = RPC_PROTO_PATCH_VERSION;
|
||||
GGML_PRINT_DEBUG("[%s] version: %d.%d.%d\n", __func__, response.major, response.minor, response.patch);
|
||||
LOG_DBG("[%s] version: %d.%d.%d\n", __func__, response.major, response.minor, response.patch);
|
||||
}
|
||||
|
||||
bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
|
||||
@@ -929,7 +935,7 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
|
||||
GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_DBG("[%s] buffer: %p, data: %p\n", __func__, (void*)tensor->buffer, tensor->data);
|
||||
if (tensor->buffer == nullptr) {
|
||||
//No buffer allocated.
|
||||
buft = ggml_backend_get_default_buffer_type(backend);
|
||||
@@ -937,7 +943,7 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
|
||||
buft = tensor->buffer->buft;
|
||||
}
|
||||
|
||||
response.alloc_size = ggml_backend_buft_get_alloc_size(buft,tensor);
|
||||
response.alloc_size = ggml_backend_buft_get_alloc_size(buft, tensor);
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -950,29 +956,29 @@ void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_
|
||||
if (buffer != nullptr) {
|
||||
response.remote_ptr = reinterpret_cast<uint64_t>(buffer);
|
||||
response.remote_size = buffer->size;
|
||||
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
|
||||
LOG_DBG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
|
||||
buffers.insert(buffer);
|
||||
} else {
|
||||
GGML_LOG_ERROR("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
|
||||
LOG_DBG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
|
||||
}
|
||||
}
|
||||
|
||||
void rpc_server::get_alignment(rpc_msg_get_alignment_rsp & response) {
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
|
||||
size_t alignment = ggml_backend_buft_get_alignment(buft);
|
||||
GGML_PRINT_DEBUG("[%s] alignment: %lu\n", __func__, alignment);
|
||||
LOG_DBG("[%s] alignment: %lu\n", __func__, alignment);
|
||||
response.alignment = alignment;
|
||||
}
|
||||
|
||||
void rpc_server::get_max_size(rpc_msg_get_max_size_rsp & response) {
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
|
||||
size_t max_size = ggml_backend_buft_get_max_size(buft);
|
||||
GGML_PRINT_DEBUG("[%s] max_size: %lu\n", __func__, max_size);
|
||||
LOG_DBG("[%s] max_size: %lu\n", __func__, max_size);
|
||||
response.max_size = max_size;
|
||||
}
|
||||
|
||||
bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response) {
|
||||
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
||||
LOG_DBG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
||||
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
||||
if (buffers.find(buffer) == buffers.end()) {
|
||||
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
||||
@@ -984,7 +990,7 @@ bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rp
|
||||
}
|
||||
|
||||
bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
|
||||
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
||||
LOG_DBG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
||||
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
||||
if (buffers.find(buffer) == buffers.end()) {
|
||||
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
||||
@@ -996,7 +1002,7 @@ bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
|
||||
}
|
||||
|
||||
bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
|
||||
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
|
||||
LOG_DBG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
|
||||
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
||||
if (buffers.find(buffer) == buffers.end()) {
|
||||
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
||||
@@ -1073,7 +1079,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
||||
LOG_DBG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
||||
|
||||
// sanitize tensor->data
|
||||
{
|
||||
@@ -1096,7 +1102,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||
fs::path cache_file = fs::path(cache_dir) / hash_str;
|
||||
std::ofstream ofs(cache_file, std::ios::binary);
|
||||
ofs.write((const char *)data, size);
|
||||
printf("[%s] saved to '%s'\n", __func__, cache_file.c_str());
|
||||
GGML_LOG_INFO("[%s] saved to '%s'\n", __func__, cache_file.c_str());
|
||||
}
|
||||
ggml_backend_tensor_set(tensor, data, offset, size);
|
||||
return true;
|
||||
@@ -1142,8 +1148,8 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n",
|
||||
__func__, (void*)tensor->buffer, tensor->data, request.offset, size, request.hash);
|
||||
LOG_DBG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n",
|
||||
__func__, (void*)tensor->buffer, tensor->data, request.offset, size, request.hash);
|
||||
|
||||
// sanitize tensor->data
|
||||
{
|
||||
@@ -1177,7 +1183,7 @@ bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
|
||||
GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_DBG("[%s] buffer: %p, data: %p\n", __func__, (void*)tensor->buffer, tensor->data);
|
||||
// Call the backend's buffer_init_tensor function
|
||||
ggml_backend_buffer_t buffer = tensor->buffer;
|
||||
if (buffer && buffer->iface.init_tensor) {
|
||||
@@ -1210,7 +1216,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, request.offset, request.size);
|
||||
LOG_DBG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, request.offset, request.size);
|
||||
|
||||
// sanitize tensor->data
|
||||
{
|
||||
@@ -1254,7 +1260,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
|
||||
uint64_t dst_buf_sz = (uint64_t) ggml_backend_buffer_get_size(dst->buffer);
|
||||
|
||||
if (dst_data + src_size > dst_base + dst_buf_sz) {
|
||||
GGML_PRINT_DEBUG("[%s] out-of-bounds write in rpc_server::copy_tensor:\n"
|
||||
GGML_LOG_ERROR("[%s] out-of-bounds write in rpc_server::copy_tensor:\n"
|
||||
" write range : [0x%" PRIx64 ", 0x%" PRIx64 "]\n"
|
||||
" buffer base: [0x%" PRIx64 ", 0x%" PRIx64 "]\n",
|
||||
__func__,
|
||||
@@ -1265,8 +1271,8 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n",
|
||||
__func__, (void*) src->buffer, (void*) dst->buffer);
|
||||
LOG_DBG("[%s] src->buffer: %p, dst->buffer: %p\n",
|
||||
__func__, (void*) src->buffer, (void*) dst->buffer);
|
||||
|
||||
response.result = ggml_backend_buffer_copy_tensor(src, dst);
|
||||
return true;
|
||||
@@ -1342,7 +1348,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph
|
||||
return false;
|
||||
}
|
||||
const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
|
||||
GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
|
||||
LOG_DBG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
|
||||
|
||||
size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
|
||||
|
||||
@@ -1394,7 +1400,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
||||
}
|
||||
// the first command sent by the client must be HELLO
|
||||
if (cmd != RPC_CMD_HELLO) {
|
||||
fprintf(stderr, "Expected HELLO command, update client\n");
|
||||
GGML_LOG_ERROR("Expected HELLO command, update client\n");
|
||||
return;
|
||||
}
|
||||
if (!recv_msg(sockfd, nullptr, 0)) {
|
||||
@@ -1411,7 +1417,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
||||
}
|
||||
if (cmd >= RPC_CMD_COUNT) {
|
||||
// fail fast if the command is invalid
|
||||
fprintf(stderr, "Unknown command: %d\n", cmd);
|
||||
GGML_LOG_ERROR("Unknown command: %d\n", cmd);
|
||||
break;
|
||||
}
|
||||
switch (cmd) {
|
||||
@@ -1599,7 +1605,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
fprintf(stderr, "Unknown command: %d\n", cmd);
|
||||
GGML_LOG_ERROR("Unknown command: %d\n", cmd);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -96,6 +96,7 @@ class Keys:
|
||||
FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
||||
EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length"
|
||||
EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length"
|
||||
EXPERT_CHUNK_FEED_FORWARD_LENGTH = "{arch}.expert_chunk_feed_forward_length"
|
||||
USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
||||
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||
EXPERT_COUNT = "{arch}.expert_count"
|
||||
@@ -104,6 +105,8 @@ class Keys:
|
||||
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
|
||||
EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
|
||||
EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
|
||||
EXPERT_GROUP_SCALE = "{arch}.expert_group_scale"
|
||||
EXPERTS_PER_GROUP = "{arch}.experts_per_group"
|
||||
MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers"
|
||||
NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
|
||||
POOLING_TYPE = "{arch}.pooling_type"
|
||||
@@ -401,6 +404,7 @@ class MODEL_ARCH(IntEnum):
|
||||
LLADA = auto()
|
||||
LLADA_MOE = auto()
|
||||
SEED_OSS = auto()
|
||||
GROVEMOE = auto()
|
||||
|
||||
|
||||
class VISION_PROJECTOR_TYPE(IntEnum):
|
||||
@@ -450,6 +454,9 @@ class MODEL_TENSOR(IntEnum):
|
||||
FFN_GATE_SHEXP = auto()
|
||||
FFN_DOWN_SHEXP = auto()
|
||||
FFN_UP_SHEXP = auto()
|
||||
FFN_GATE_CHEXP = auto()
|
||||
FFN_DOWN_CHEXP = auto()
|
||||
FFN_UP_CHEXP = auto()
|
||||
FFN_EXP_PROBS_B = auto()
|
||||
ATTN_Q_NORM = auto()
|
||||
ATTN_K_NORM = auto()
|
||||
@@ -738,6 +745,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.LLADA: "llada",
|
||||
MODEL_ARCH.LLADA_MOE: "llada-moe",
|
||||
MODEL_ARCH.SEED_OSS: "seed_oss",
|
||||
MODEL_ARCH.GROVEMOE: "grovemoe",
|
||||
}
|
||||
|
||||
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
||||
@@ -784,6 +792,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
|
||||
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
|
||||
MODEL_TENSOR.FFN_GATE_CHEXP: "blk.{bid}.ffn_gate_chexps",
|
||||
MODEL_TENSOR.FFN_DOWN_CHEXP: "blk.{bid}.ffn_down_chexps",
|
||||
MODEL_TENSOR.FFN_UP_CHEXP: "blk.{bid}.ffn_up_chexps",
|
||||
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
||||
MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
|
||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
||||
@@ -2712,6 +2723,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
],
|
||||
MODEL_ARCH.GROVEMOE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_CHEXP,
|
||||
MODEL_TENSOR.FFN_DOWN_CHEXP,
|
||||
MODEL_TENSOR.FFN_UP_CHEXP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
||||
@@ -670,6 +670,9 @@ class GGUFWriter:
|
||||
def add_expert_shared_feed_forward_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_expert_chunk_feed_forward_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERT_CHUNK_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_parallel_residual(self, use: bool) -> None:
|
||||
self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
|
||||
|
||||
@@ -757,6 +760,12 @@ class GGUFWriter:
|
||||
def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
|
||||
|
||||
def add_expert_group_scale(self, value: float) -> None:
|
||||
self.add_float32(Keys.LLM.EXPERT_GROUP_SCALE.format(arch=self.arch), value)
|
||||
|
||||
def add_experts_per_group(self, count: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERTS_PER_GROUP.format(arch=self.arch), count)
|
||||
|
||||
def add_moe_every_n_layers(self, value: int) -> None:
|
||||
self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value)
|
||||
|
||||
|
||||
@@ -427,6 +427,10 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_CHEXP: (
|
||||
"model.layers.{bid}.mlp.chunk_experts.up_proj", # grovemoe
|
||||
),
|
||||
|
||||
# AWQ-activation gate
|
||||
MODEL_TENSOR.FFN_ACT: (
|
||||
"transformer.blocks.{bid}.ffn.act", # mpt
|
||||
@@ -468,6 +472,10 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.mlp.shared_mlp.gate_proj", # hunyuan
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_CHEXP: (
|
||||
"model.layers.{bid}.mlp.chunk_experts.gate_proj", # grovemoe
|
||||
),
|
||||
|
||||
# Feed-forward down
|
||||
MODEL_TENSOR.FFN_DOWN: (
|
||||
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
|
||||
@@ -524,6 +532,10 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_CHEXP: (
|
||||
"model.layers.{bid}.mlp.chunk_experts.down_proj", # grovemoe
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
|
||||
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
|
||||
|
||||
@@ -98,6 +98,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_LLADA, "llada" },
|
||||
{ LLM_ARCH_LLADA_MOE, "llada-moe" },
|
||||
{ LLM_ARCH_SEED_OSS, "seed_oss" },
|
||||
{ LLM_ARCH_GROVEMOE, "grovemoe" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@@ -125,6 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
|
||||
{ LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
|
||||
{ LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
|
||||
{ LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, "%s.expert_chunk_feed_forward_length" },
|
||||
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
|
||||
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
||||
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
||||
@@ -133,6 +135,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
|
||||
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
|
||||
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
|
||||
{ LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" },
|
||||
{ LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
|
||||
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
|
||||
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
|
||||
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
||||
@@ -721,6 +725,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_CLS_OUT, "cls.output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
@@ -2185,6 +2190,29 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_GROVEMOE,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_CHEXPS, "blk.%d.ffn_gate_chexps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_CHEXPS, "blk.%d.ffn_down_chexps" },
|
||||
{ LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -2317,6 +2345,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
||||
{LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
||||
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
||||
{LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
||||
{LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
||||
{LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
||||
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
// altup / laurel (gemma 3n)
|
||||
{LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
|
||||
|
||||
@@ -102,6 +102,7 @@ enum llm_arch {
|
||||
LLM_ARCH_LLADA,
|
||||
LLM_ARCH_LLADA_MOE,
|
||||
LLM_ARCH_SEED_OSS,
|
||||
LLM_ARCH_GROVEMOE,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -129,6 +130,7 @@ enum llm_kv {
|
||||
LLM_KV_FEED_FORWARD_LENGTH,
|
||||
LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
|
||||
LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
|
||||
LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,
|
||||
LLM_KV_USE_PARALLEL_RESIDUAL,
|
||||
LLM_KV_TENSOR_DATA_LAYOUT,
|
||||
LLM_KV_EXPERT_COUNT,
|
||||
@@ -137,6 +139,8 @@ enum llm_kv {
|
||||
LLM_KV_EXPERT_WEIGHTS_SCALE,
|
||||
LLM_KV_EXPERT_WEIGHTS_NORM,
|
||||
LLM_KV_EXPERT_GATING_FUNC,
|
||||
LLM_KV_EXPERT_GROUP_SCALE,
|
||||
LLM_KV_EXPERTS_PER_GROUP,
|
||||
LLM_KV_MOE_EVERY_N_LAYERS,
|
||||
LLM_KV_NEXTN_PREDICT_LAYERS,
|
||||
LLM_KV_POOLING_TYPE,
|
||||
@@ -301,6 +305,9 @@ enum llm_tensor {
|
||||
LLM_TENSOR_FFN_DOWN_SHEXP,
|
||||
LLM_TENSOR_FFN_GATE_SHEXP,
|
||||
LLM_TENSOR_FFN_UP_SHEXP,
|
||||
LLM_TENSOR_FFN_DOWN_CHEXPS,
|
||||
LLM_TENSOR_FFN_GATE_CHEXPS,
|
||||
LLM_TENSOR_FFN_UP_CHEXPS,
|
||||
LLM_TENSOR_FFN_EXP_PROBS_B,
|
||||
LLM_TENSOR_ATTN_Q_NORM,
|
||||
LLM_TENSOR_ATTN_K_NORM,
|
||||
|
||||
+40
-22
@@ -204,7 +204,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
|
||||
std::vector<int> target_pos(n_seqs_unq, -1);
|
||||
std::vector<int> target_row(n_seqs_unq, -1);
|
||||
|
||||
bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;
|
||||
const bool last = (
|
||||
cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
|
||||
(cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
|
||||
);
|
||||
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
const llama_pos pos = ubatch->pos[i];
|
||||
@@ -920,15 +923,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
selection_probs = logits;
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_GROVEMOE) {
|
||||
selection_probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
|
||||
cb(selection_probs, "ffn_moe_probs_biased", il);
|
||||
}
|
||||
|
||||
// select experts
|
||||
ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
|
||||
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
||||
cb(selected_experts, "ffn_moe_topk", il);
|
||||
|
||||
ggml_tensor * weights = ggml_get_rows(ctx0,
|
||||
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
|
||||
if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
|
||||
// TODO: Use scalar div instead when/if implemented
|
||||
ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32);
|
||||
selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32);
|
||||
probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens);
|
||||
} else {
|
||||
probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens);
|
||||
}
|
||||
|
||||
ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
|
||||
cb(weights, "ffn_moe_weights", il);
|
||||
|
||||
|
||||
if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
|
||||
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
|
||||
weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
|
||||
@@ -952,6 +969,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
cb(weights, "ffn_moe_weights_scaled", il);
|
||||
}
|
||||
|
||||
//call early so that topk-moe can be used
|
||||
ggml_build_forward_expand(gf, weights);
|
||||
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
|
||||
|
||||
if (weight_before_ffn) {
|
||||
@@ -1177,7 +1197,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const {
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_cls() const {
|
||||
auto inp = std::make_unique<llm_graph_input_cls>(cparams);
|
||||
auto inp = std::make_unique<llm_graph_input_cls>(cparams, arch);
|
||||
|
||||
auto & cur = inp->cls;
|
||||
|
||||
@@ -1877,34 +1897,32 @@ void llm_graph_context::build_pooling(
|
||||
case LLAMA_POOLING_TYPE_RANK:
|
||||
{
|
||||
ggml_tensor * inp_cls = build_inp_cls();
|
||||
inp = ggml_get_rows(ctx0, inp, inp_cls);
|
||||
cur = ggml_get_rows(ctx0, inp, inp_cls);
|
||||
|
||||
// classification head
|
||||
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
|
||||
if (cls) {
|
||||
// classification head
|
||||
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
|
||||
cur = ggml_mul_mat(ctx0, cls, inp);
|
||||
cur = ggml_mul_mat(ctx0, cls, cur);
|
||||
if (cls_b) {
|
||||
cur = ggml_add(ctx0, cur, cls_b);
|
||||
}
|
||||
cur = ggml_tanh(ctx0, cur);
|
||||
}
|
||||
|
||||
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
||||
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
|
||||
if (cls_out) {
|
||||
cur = ggml_mul_mat(ctx0, cls_out, cur);
|
||||
if (cls_out_b) {
|
||||
cur = ggml_add(ctx0, cur, cls_out_b);
|
||||
}
|
||||
}
|
||||
} else if (cls_out) {
|
||||
// Single layer classification head (direct projection)
|
||||
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
|
||||
cur = ggml_mul_mat(ctx0, cls_out, inp);
|
||||
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
||||
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
|
||||
// Single layer classification head (direct projection)
|
||||
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
|
||||
if (cls_out) {
|
||||
cur = ggml_mul_mat(ctx0, cls_out, cur);
|
||||
if (cls_out_b) {
|
||||
cur = ggml_add(ctx0, cur, cls_out_b);
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
|
||||
}
|
||||
|
||||
// softmax for qwen3 reranker
|
||||
if (arch == LLM_ARCH_QWEN3) {
|
||||
cur = ggml_soft_max(ctx0, cur);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
|
||||
+2
-1
@@ -206,7 +206,7 @@ public:
|
||||
|
||||
class llm_graph_input_cls : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
|
||||
llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
|
||||
virtual ~llm_graph_input_cls() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
@@ -214,6 +214,7 @@ public:
|
||||
ggml_tensor * cls; // I32 [n_batch]
|
||||
|
||||
const llama_cparams cparams;
|
||||
const llm_arch arch;
|
||||
};
|
||||
|
||||
class llm_graph_input_rs : public llm_graph_input_i {
|
||||
|
||||
+4
-1
@@ -69,10 +69,13 @@ struct llama_hparams {
|
||||
uint32_t n_lora_kv = 0;
|
||||
uint32_t n_ff_exp = 0;
|
||||
uint32_t n_ff_shexp = 0;
|
||||
uint32_t n_ff_chexp = 0;
|
||||
uint32_t n_expert_shared = 0;
|
||||
uint32_t n_norm_groups = 0;
|
||||
uint32_t n_group_experts = 0;
|
||||
|
||||
float expert_weights_scale = 0.0;
|
||||
float expert_group_scale = 0.05f;
|
||||
float expert_weights_scale = 0.0f;
|
||||
bool expert_weights_norm = false;
|
||||
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
|
||||
uint32_t moe_every_n_layers = 0;
|
||||
|
||||
@@ -2009,6 +2009,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GROVEMOE:
|
||||
{
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
|
||||
ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
|
||||
ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 48: type = LLM_TYPE_30B_A3B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: throw std::runtime_error("unsupported model architecture");
|
||||
}
|
||||
|
||||
@@ -3167,6 +3180,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
// output rerank head
|
||||
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
@@ -5837,6 +5853,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GROVEMOE:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||
// if output is NULL, init from the input tok embed
|
||||
if (output == NULL) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
|
||||
GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
|
||||
GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
||||
|
||||
// MoE branch
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
||||
const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
|
||||
const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
|
||||
|
||||
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||
|
||||
layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
|
||||
layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp, n_embd, n_chunk_expert}, 0);
|
||||
layer.ffn_up_chexps = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
@@ -6176,6 +6239,13 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_GROVEMOE) {
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
|
||||
LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
|
||||
LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
|
||||
}
|
||||
|
||||
vocab.print_info();
|
||||
}
|
||||
|
||||
@@ -18861,6 +18931,156 @@ struct llm_build_smallthinker : public llm_graph_context{
|
||||
}
|
||||
};
|
||||
|
||||
// Graph builder for LLM_ARCH_GROVEMOE.
// Each layer runs RMS-normed self-attention (with per-head Q/K norm and RoPE),
// then two build_moe_ffn passes - one over the regular expert tensors and one
// over the chunk expert tensors ("chexps") - that share the same router logits.
// The chunk-expert result is scaled by hparams.expert_group_scale and added to
// the regular expert result before the residual connection.
struct llm_build_grovemoe : public llm_graph_context {
|
||||
llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
// number of chunk experts: n_expert divided by the group size
// NOTE(review): divides by hparams.n_group_experts without a local check - presumably validated at load time; confirm
const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
|
||||
|
||||
// K/V head sizes must match and the whole head is rotated
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
// keep the layer input for the attention residual
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self_attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
// split the projections into heads: [n_embd_head, n_head(_kv), n_tokens]
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
// Q is RMS-normed with attn_q_norm before RoPE is applied
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
// K is RMS-normed with attn_k_norm before RoPE is applied
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
// attention with 1/sqrt(head size) scaling, followed by the output projection wo (+ bo)
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
// on the last layer keep only the rows selected by inp_out_ids (when present)
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
// attention residual
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// MoE branch
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
// router logits, computed once and passed to both build_moe_ffn calls below
ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, cur); // [n_expert, n_tokens]
|
||||
cb(probs, "ffn_moe_logits", il);
|
||||
|
||||
// first pass: regular experts (ffn_*_exps)
ggml_tensor * moe_out =
|
||||
build_moe_ffn(cur,
|
||||
nullptr,
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il, probs);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
// NOTE(review): `cur` now holds the regular-expert output, so the chunk-expert pass
// below takes that output as its input rather than the ffn_norm result - confirm this
// matches the reference implementation
cur = moe_out;
|
||||
|
||||
// TODO: Only do the expert selection and weights once
|
||||
// second pass: chunk experts (ffn_*_chexps); the number of used experts is
// clamped so it never exceeds n_chunk_expert
moe_out =
|
||||
build_moe_ffn(cur,
|
||||
nullptr,
|
||||
model.layers[il].ffn_up_chexps,
|
||||
model.layers[il].ffn_gate_chexps,
|
||||
model.layers[il].ffn_down_chexps,
|
||||
nullptr,
|
||||
n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il, probs);
|
||||
cb(moe_out, "ffn_adj_moe_out", il);
|
||||
|
||||
// combine: regular-expert output + expert_group_scale * chunk-expert output
cur = ggml_add(ctx0, cur, ggml_scale(ctx0, moe_out, hparams.expert_group_scale));
|
||||
cb(cur, "ffn_final_moe_out", il);
|
||||
|
||||
// feed-forward residual
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
// final RMS norm; its result is exposed as the embedding output
cur = build_norm(cur,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
|
||||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
};
|
||||
|
||||
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
|
||||
llama_memory_i * res;
|
||||
|
||||
@@ -19387,6 +19607,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GROVEMOE:
|
||||
{
|
||||
llm = std::make_unique<llm_build_grovemoe>(*this, params);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -19592,6 +19816,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_SMALLTHINKER:
|
||||
case LLM_ARCH_GLM4_MOE:
|
||||
case LLM_ARCH_SEED_OSS:
|
||||
case LLM_ARCH_GROVEMOE:
|
||||
return LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
case LLM_ARCH_QWEN2VL:
|
||||
|
||||
@@ -275,6 +275,11 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_down_shexp = nullptr;
|
||||
struct ggml_tensor * ffn_up_shexp = nullptr;
|
||||
|
||||
// ff adjugate experts (chexps)
|
||||
struct ggml_tensor * ffn_gate_chexps = nullptr;
|
||||
struct ggml_tensor * ffn_down_chexps = nullptr;
|
||||
struct ggml_tensor * ffn_up_chexps = nullptr;
|
||||
|
||||
// ff bias
|
||||
struct ggml_tensor * ffn_gate_b = nullptr;
|
||||
struct ggml_tensor * ffn_down_b = nullptr; // b2
|
||||
|
||||
@@ -4418,6 +4418,49 @@ struct test_argsort : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// Test case for the MoE routing sequence: softmax over the logits, top-k expert
// selection, gathering of the selected probabilities and, optionally,
// renormalisation of the selected weights so they sum to 1 per token.
struct test_topk_moe: public test_case {
|
||||
// shape of the logits tensor; ne[0] is read as n_expert and ne[1] as n_tokens
const std::array<int64_t, 4> ne;
|
||||
// number of experts selected per token (k of the top-k)
const int n_expert_used;
|
||||
// when true, the selected weights are divided by their per-token sum
const bool with_norm;
|
||||
test_topk_moe(std::array<int64_t, 4> ne = {10, 5, 1, 1}, int n_expert_used = 1, bool with_norm = false)
|
||||
: ne(ne), n_expert_used(n_expert_used), with_norm(with_norm) {
|
||||
// cannot select more experts than there are
GGML_ASSERT(n_expert_used <= ne[0]);
|
||||
}
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(ne, n_expert_used, with_norm);
|
||||
}
|
||||
|
||||
// the graph is reported under a single synthetic name regardless of the tensor passed in
std::string op_desc(ggml_tensor * t) override {
|
||||
GGML_UNUSED(t);
|
||||
return "TOPK_MOE";
|
||||
}
|
||||
|
||||
// NOTE(review): presumably makes the harness evaluate the whole graph rather than only the final op - confirm against test_case
bool run_whole_graph() override { return true; }
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
const int n_expert = ne[0];
|
||||
const int n_tokens = ne[1];
|
||||
|
||||
ggml_tensor * logits = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data());
|
||||
ggml_tensor * probs = ggml_soft_max(ctx, logits);
|
||||
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
|
||||
|
||||
// view probs as [1, n_expert, n_tokens] so get_rows picks one probability per selected expert
ggml_tensor * out = ggml_get_rows(ctx, ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
|
||||
|
||||
if (with_norm) {
|
||||
// flatten to 2D so the row sum runs over the selected experts of each token
out = ggml_reshape_2d(ctx, out, n_expert_used, n_tokens);
|
||||
ggml_tensor * weights_sum = ggml_sum_rows(ctx, out); // [1, n_tokens]
|
||||
|
||||
out = ggml_div(ctx, out, weights_sum); // [n_expert_used, n_tokens]
|
||||
// restore the [1, n_expert_used, n_tokens] shape of the un-normalised path
out = ggml_reshape_3d(ctx, out, 1, n_expert_used, n_tokens);
|
||||
}
|
||||
|
||||
ggml_set_name(out, "out");
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_SUM
|
||||
struct test_sum : public test_case {
|
||||
const ggml_type type;
|
||||
@@ -6117,7 +6160,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_l2_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
||||
}
|
||||
for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f}) {
|
||||
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
||||
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
|
||||
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
|
||||
test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
|
||||
test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
|
||||
@@ -6588,6 +6631,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}));
|
||||
test_cases.emplace_back(new test_opt_step_sgd(GGML_TYPE_F32, {10, 5, 4, 3}));
|
||||
|
||||
for (bool with_norm : {false, true}) {
|
||||
test_cases.emplace_back(new test_topk_moe({8, 22, 1, 1}, 4, with_norm));
|
||||
test_cases.emplace_back(new test_topk_moe({32, 22, 1, 1}, 8, with_norm));
|
||||
test_cases.emplace_back(new test_topk_moe({128, 1, 1, 1}, 128, with_norm));
|
||||
}
|
||||
|
||||
#if 0
|
||||
// these tests are disabled to save execution time, but they can be handy for debugging
|
||||
test_cases.emplace_back(new test_llama(2, true));
|
||||
|
||||
@@ -5093,21 +5093,15 @@ int main(int argc, char ** argv) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<server_tokens> tokenized_queries = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, query, /* add_special */ false, true);
|
||||
if (tokenized_queries.size() != 1) {
|
||||
res_error(res, format_error_response("\"query\" must contain only a single prompt", ERROR_TYPE_INVALID_REQUEST));
|
||||
}
|
||||
|
||||
// create and queue the task
|
||||
json responses = json::array();
|
||||
bool error = false;
|
||||
std::unordered_set<int> task_ids;
|
||||
{
|
||||
std::vector<server_task> tasks;
|
||||
auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
|
||||
tasks.reserve(tokenized_docs.size());
|
||||
for (size_t i = 0; i < tokenized_docs.size(); i++) {
|
||||
auto tmp = format_rerank(ctx_server.vocab, tokenized_queries[0], tokenized_docs[i]);
|
||||
tasks.reserve(documents.size());
|
||||
for (size_t i = 0; i < documents.size(); i++) {
|
||||
auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]);
|
||||
server_task task = server_task(SERVER_TASK_TYPE_RERANK);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.index = i;
|
||||
|
||||
@@ -64,3 +64,33 @@ cmake --build build -j --target llama-server && ./tools/server/tests/tests.sh
|
||||
```
|
||||
|
||||
To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html)
|
||||
|
||||
### Debugging external llama-server
|
||||
It can sometimes be useful to run the server in a debugger when investigating test
|
||||
failures. To do this, the environment variable `DEBUG_EXTERNAL=1` can be set
|
||||
which will cause the test to skip starting a llama-server itself. Instead, the
|
||||
server can be started in a debugger.
|
||||
|
||||
Example using `gdb`:
|
||||
```console
|
||||
$ gdb --args ../../../build/bin/llama-server \
|
||||
--host 127.0.0.1 --port 8080 \
|
||||
--temp 0.8 --seed 42 \
|
||||
--hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf \
|
||||
--batch-size 32 --no-slots --alias tinyllama-2 --ctx-size 512 \
|
||||
--parallel 2 --n-predict 64
|
||||
```
|
||||
A breakpoint can be set before running the server:
|
||||
```console
|
||||
(gdb) br server.cpp:4604
|
||||
(gdb) r
|
||||
main: server is listening on http://127.0.0.1:8080 - starting the main loop
|
||||
srv update_slots: all slots are idle
|
||||
```
|
||||
|
||||
And then the test in question can be run in another terminal:
|
||||
```console
|
||||
(venv) $ env DEBUG_EXTERNAL=1 ./tests.sh unit/test_chat_completion.py -v -x
|
||||
```
|
||||
And this should trigger the breakpoint and allow inspection of the server state
|
||||
in the debugger terminal.
|
||||
|
||||
@@ -99,8 +99,12 @@ class ServerProcess:
|
||||
self.debug = True
|
||||
if "PORT" in os.environ:
|
||||
self.server_port = int(os.environ["PORT"])
|
||||
self.external_server = "DEBUG_EXTERNAL" in os.environ
|
||||
|
||||
def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
|
||||
if self.external_server:
|
||||
print(f"[external_server]: Assuming external server running on {self.server_host}:{self.server_port}")
|
||||
return
|
||||
if self.server_path is not None:
|
||||
server_path = self.server_path
|
||||
elif "LLAMA_SERVER_BIN_PATH" in os.environ:
|
||||
@@ -244,6 +248,9 @@ class ServerProcess:
|
||||
raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")
|
||||
|
||||
def stop(self) -> None:
|
||||
if self.external_server:
|
||||
print("[external_server]: Not stopping external server")
|
||||
return
|
||||
if self in server_instances:
|
||||
server_instances.remove(self)
|
||||
if self.process:
|
||||
|
||||
+40
-28
@@ -1368,34 +1368,6 @@ static std::string fnv_hash(const uint8_t * data, size_t len) {
|
||||
return std::to_string(hash);
|
||||
}
|
||||
|
||||
|
||||
// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
// Takes already-tokenized query and document and returns the concatenated
// sequence. BOS, EOS and SEP are each emitted only when the vocab's
// corresponding add_bos / add_eos / add_sep flag is set.
|
||||
static server_tokens format_rerank(const struct llama_vocab * vocab, server_tokens & query, server_tokens & doc) {
|
||||
server_tokens result = {};
|
||||
|
||||
// Get EOS token - use SEP token as fallback if EOS is not available
|
||||
llama_token eos_token = llama_vocab_eos(vocab);
|
||||
if (eos_token == LLAMA_TOKEN_NULL) {
|
||||
eos_token = llama_vocab_sep(vocab);
|
||||
}
|
||||
// optional leading BOS
if (llama_vocab_get_add_bos(vocab)) {
|
||||
result.push_back(llama_vocab_bos(vocab));
|
||||
}
|
||||
result.push_back(query);
|
||||
// optional EOS after the query
if (llama_vocab_get_add_eos(vocab)) {
|
||||
result.push_back(eos_token);
|
||||
}
|
||||
// optional SEP between query and document
if (llama_vocab_get_add_sep(vocab)) {
|
||||
result.push_back(llama_vocab_sep(vocab));
|
||||
}
|
||||
result.push_back(doc);
|
||||
// optional EOS after the document
if (llama_vocab_get_add_eos(vocab)) {
|
||||
result.push_back(eos_token);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files) {
|
||||
mtmd::bitmaps bitmaps;
|
||||
for (auto & file : files) {
|
||||
@@ -1501,3 +1473,43 @@ static std::vector<server_tokens> tokenize_input_prompts(const llama_vocab * voc
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
// If the model provides a "rerank" chat template, that template is used instead:
// its "{query}" and "{document}" placeholders are replaced and the resulting
// prompt is tokenized as a whole. Otherwise query and document are tokenized
// separately and joined with the BOS/EOS/SEP tokens the vocab asks for.
|
||||
static server_tokens format_rerank(const struct llama_model * model, const struct llama_vocab * vocab, mtmd_context * mctx, const std::string & query, const std::string & doc) {
|
||||
server_tokens result = {};
|
||||
|
||||
// nullptr when the model has no template named "rerank"
const char * rerank_prompt = llama_model_chat_template(model, "rerank");
|
||||
|
||||
if (rerank_prompt != nullptr) {
|
||||
std::string prompt = rerank_prompt;
|
||||
// NOTE(review): the substitutions run sequentially, so a literal "{document}" inside
// the query text would also be replaced by the document - confirm this is acceptable
string_replace_all(prompt, "{query}" , query);
|
||||
string_replace_all(prompt, "{document}", doc );
|
||||
server_tokens tokens = tokenize_input_subprompt(vocab, mctx, prompt, false, true);
|
||||
result.push_back(tokens);
|
||||
} else {
|
||||
// No rerank template: tokenize query and document separately
|
||||
server_tokens query_tokens = tokenize_input_subprompt(vocab, mctx, query, false, false);
|
||||
server_tokens doc_tokens = tokenize_input_subprompt(vocab, mctx, doc, false, false);
|
||||
// Get EOS token - use SEP token as fallback if EOS is not available
llama_token eos_token = llama_vocab_eos(vocab);
|
||||
if (eos_token == LLAMA_TOKEN_NULL) {
|
||||
eos_token = llama_vocab_sep(vocab);
|
||||
}
|
||||
|
||||
// optional leading BOS
if (llama_vocab_get_add_bos(vocab)) {
|
||||
result.push_back(llama_vocab_bos(vocab));
|
||||
}
|
||||
result.push_back(query_tokens);
|
||||
// optional EOS after the query
if (llama_vocab_get_add_eos(vocab)) {
|
||||
result.push_back(eos_token);
|
||||
}
|
||||
// optional SEP between query and document
if (llama_vocab_get_add_sep(vocab)) {
|
||||
result.push_back(llama_vocab_sep(vocab));
|
||||
}
|
||||
result.push_back(doc_tokens);
|
||||
// optional EOS after the document
if (llama_vocab_get_add_eos(vocab)) {
|
||||
result.push_back(eos_token);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
Vendored
+4318
-2039
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user