mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-30 17:47:40 +02:00
Compare commits
16 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 00f35d509e | |||
| 6028bf7435 | |||
| bc5182272c | |||
| e71d48e326 | |||
| b0493156fa | |||
| f4586ee598 | |||
| 60a7658810 | |||
| efe3a90996 | |||
| bbd57b7eaf | |||
| 25ff6f7659 | |||
| be48528b06 | |||
| cf9e5648a7 | |||
| fba5c0d680 | |||
| 53d0a12658 | |||
| 27093afe78 | |||
| 228f724d9c |
@@ -0,0 +1,52 @@
|
||||
name: "Copilot Setup Steps"
|
||||
|
||||
# Automatically run the setup steps when they are changed to allow for easy validation, and
|
||||
# allow manual testing through the repository's "Actions" tab
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
paths:
|
||||
- .github/workflows/copilot-setup-steps.yml
|
||||
pull_request:
|
||||
paths:
|
||||
- .github/workflows/copilot-setup-steps.yml
|
||||
|
||||
jobs:
|
||||
# The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
|
||||
copilot-setup-steps:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
# Set the permissions to the lowest permissions possible needed for your steps.
|
||||
# Copilot will be given its own token for its operations.
|
||||
permissions:
|
||||
# If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
|
||||
contents: read
|
||||
|
||||
# You can define any steps you want, and they will run before the agent starts.
|
||||
# If you do not check out your code, Copilot will do this for you.
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: copilot-setup-steps
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
python3 -m venv .venv
|
||||
.venv/bin/activate
|
||||
pip install -r requirements/requirements-all.txt
|
||||
@@ -240,7 +240,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
<details>
|
||||
<summary>Infrastructure</summary>
|
||||
|
||||
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
|
||||
- [Paddler](https://github.com/intentee/paddler) - Open-source LLMOps platform for hosting and scaling AI in your own infrastructure
|
||||
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
|
||||
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
|
||||
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
|
||||
|
||||
+1
-5
@@ -2949,11 +2949,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
|
||||
"(default: auto)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
|
||||
else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
|
||||
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
|
||||
else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
|
||||
else { throw std::invalid_argument("invalid value"); }
|
||||
params.reasoning_format = common_reasoning_format_from_name(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
|
||||
add_opt(common_arg(
|
||||
|
||||
@@ -552,6 +552,17 @@ common_chat_templates_ptr common_chat_templates_init(
|
||||
default_template_src = CHATML_TEMPLATE_SRC;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
|
||||
// Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
|
||||
if (default_template_src.find("<|channel|>") != std::string::npos
|
||||
// search for the error message and patch it
|
||||
&& default_template_src.find("in message.content or") != std::string::npos) {
|
||||
string_replace_all(default_template_src,
|
||||
"{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
|
||||
"{%- if false %}");
|
||||
}
|
||||
|
||||
std::string token_bos = bos_token_override;
|
||||
std::string token_eos = eos_token_override;
|
||||
bool add_bos = false;
|
||||
@@ -625,6 +636,19 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
|
||||
}
|
||||
}
|
||||
|
||||
common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
|
||||
if (format == "none") {
|
||||
return COMMON_REASONING_FORMAT_NONE;
|
||||
} else if (format == "auto") {
|
||||
return COMMON_REASONING_FORMAT_AUTO;
|
||||
} else if (format == "deepseek") {
|
||||
return COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
} else if (format == "deepseek-legacy") {
|
||||
return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
|
||||
}
|
||||
throw std::runtime_error("Unknown reasoning format: " + format);
|
||||
}
|
||||
|
||||
static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
|
||||
std::string arguments;
|
||||
if (builder.is_partial()) {
|
||||
|
||||
@@ -191,6 +191,7 @@ std::string common_chat_format_example(
|
||||
|
||||
const char* common_chat_format_name(common_chat_format format);
|
||||
const char* common_reasoning_format_name(common_reasoning_format format);
|
||||
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
|
||||
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
|
||||
|
||||
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
|
||||
|
||||
@@ -13,7 +13,7 @@ If there are differences in usage, please refer to the official build [documenta
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
git clone https://github.com/ggml-org/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ If there are differences in usage, please refer to the official build [documenta
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
git clone https://github.com/ggml-org/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
|
||||
+245
-380
@@ -753,69 +753,55 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0];
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
if (ggml_are_same_shape(src0, dst)) {
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
if (dst->type == src0->type) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
} else {
|
||||
aclnn_cast(ctx, acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
|
||||
}
|
||||
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
||||
} else {
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
|
||||
if (dst->type == src0->type) {
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE);
|
||||
return;
|
||||
} else {
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(),
|
||||
ggml_nelements(dst) * ggml_type_size(dst->type));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), src0->ne, src_trans_nb,
|
||||
GGML_MAX_DIMS);
|
||||
|
||||
aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE);
|
||||
ggml_cann_release_resources(ctx, src_trans_tensor);
|
||||
return;
|
||||
}
|
||||
} else if (ggml_is_contiguous(dst)) {
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
void* src_trans_buffer = src0->data;
|
||||
ggml_cann_pool_alloc src_buffer_allocator;
|
||||
if (!ggml_is_contiguous(src0)) {
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||
src_buffer_allocator.alloc(ctx.pool(),
|
||||
ggml_nelements(src0) * ggml_type_size(src0->type));
|
||||
src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = ggml_type_size(dst->type);
|
||||
src_trans_nb[0] = ggml_type_size(src0->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), src0->ne, src_trans_nb,
|
||||
src_trans_buffer, ggml_cann_type_mapping(src0->type),
|
||||
ggml_type_size(src0->type), src0->ne, src_trans_nb,
|
||||
GGML_MAX_DIMS);
|
||||
|
||||
aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
|
||||
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE);
|
||||
ggml_cann_release_resources(ctx, src_trans_tensor);
|
||||
return;
|
||||
} else {
|
||||
GGML_ABORT("Unsupport dst is not tontiguous.");
|
||||
cann_copy(ctx, acl_src, src_trans_tensor);
|
||||
ggml_cann_release_resources(ctx, acl_src, src_trans_tensor);
|
||||
}
|
||||
|
||||
size_t src_reshape_nb[GGML_MAX_DIMS];
|
||||
src_reshape_nb[0] = ggml_type_size(src0->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_reshape_nb[i] = src_reshape_nb[i - 1] * dst->ne[i - 1];
|
||||
}
|
||||
|
||||
aclTensor* trans_acl_src = ggml_cann_create_tensor(src_trans_buffer,
|
||||
ggml_cann_type_mapping(src0->type),ggml_type_size(src0->type),
|
||||
dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
if (dst->type == src0->type) {
|
||||
cann_copy(ctx, trans_acl_src, acl_dst);
|
||||
} else {
|
||||
aclnn_cast(ctx, trans_acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
|
||||
}
|
||||
ggml_cann_release_resources(ctx, trans_acl_src, acl_dst);
|
||||
}
|
||||
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1330,160 +1316,196 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Applies the Alibi (Attention with Linear Biases) mechanism to the
|
||||
* @details This function implements the Alibi mechanism, which introduces
|
||||
* learnable biases into the attention scores to simulate relative
|
||||
* position encoding without the need for explicit positional
|
||||
* embeddings.
|
||||
* @brief Generate a range of values and apply a scalar base exponentiation.
|
||||
*
|
||||
* @param ctx The backend CANN context for executing operations.
|
||||
* @param acl_src The source tensor representing the query or key.
|
||||
* @param acl_position The position tensor containing relative positions.
|
||||
* @param acl_dst The destination tensor where the result will be stored.
|
||||
* @param n_head The number of attention heads.
|
||||
* @param src_ne The dimensions of the source tensor.
|
||||
* @param src_nb0 The byte size of the first dimension of the source
|
||||
tensor.
|
||||
* @param max_bias The maximum bias value used in the Alibi mechanism.
|
||||
* @param dst The destination tensor object for additional metadata.
|
||||
* This function creates an evenly spaced sequence from `start` to `stop` (exclusive),
|
||||
* with step size `step`, stores it in a temporary buffer, and then computes:
|
||||
*
|
||||
* The function performs the following steps:
|
||||
* 1. Calculates the logarithm floor of the number of heads to determine the
|
||||
base for bias calculation.
|
||||
* 2. Initializes arrays with arithmetic sequences and fills them with bias
|
||||
values.
|
||||
* 3. Computes the bias tensor based on the calculated biases and arithmetic
|
||||
sequences.
|
||||
* 4. Reshapes the bias tensor to match the dimensions of the input tensors.
|
||||
* 5. Multiplies the position tensor by the bias tensor.
|
||||
* 6. Adds the result of the multiplication to the source tensor to produce the
|
||||
final output.
|
||||
* @f[
|
||||
* slope[i] = m^{\left( start + i \cdot step \right)}, \quad 0 \le i < size
|
||||
* @f]
|
||||
*
|
||||
* The results are written to the provided @p slope_buffer.
|
||||
*
|
||||
* @param ctx CANN backend context for memory allocation and operator execution.
|
||||
* @param slope_buffer Pointer to the output buffer (float array) for the computed slope values.
|
||||
* @param m Scalar base for the exponentiation.
|
||||
* @param size Number of elements in the generated sequence.
|
||||
* @param start Starting exponent offset.
|
||||
* @param stop Stopping exponent offset (exclusive).
|
||||
* @param step Step size for the exponent increment.
|
||||
*/
|
||||
static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
aclTensor* acl_position, aclTensor* acl_dst,
|
||||
const int n_head, int64_t* src_ne, const size_t src_nb0,
|
||||
float max_bias, ggml_tensor* dst) {
|
||||
const int64_t ne2_ne3 = src_ne[2] * src_ne[3];
|
||||
GGML_ASSERT(src_nb0 == sizeof(float));
|
||||
GGML_ASSERT(n_head == src_ne[2]);
|
||||
static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer,
|
||||
float m, int64_t size, float start, float stop, float step){
|
||||
int64_t ne[] = {size};
|
||||
size_t nb[] = {sizeof(float)};
|
||||
|
||||
const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
|
||||
ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * sizeof(float));
|
||||
void* arange_buffer = arange_allocator.get();
|
||||
|
||||
float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
||||
float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
||||
aclTensor* arange_tensor = ggml_cann_create_tensor(
|
||||
arange_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);
|
||||
aclnn_arange(ctx, arange_tensor, start, stop, step, size);
|
||||
|
||||
// init arange
|
||||
ggml_cann_pool_alloc arange_allocator(ctx.pool(),
|
||||
ne2_ne3 * ggml_type_size(dst->type));
|
||||
void* tmp_arange_buffer = arange_allocator.get();
|
||||
aclTensor* slope_tensor = ggml_cann_create_tensor(
|
||||
slope_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);
|
||||
|
||||
// arange1: [1, ..., n_heads_log2_floor+1)
|
||||
float start = 1;
|
||||
float stop = n_heads_log2_floor + 1;
|
||||
float step = 1;
|
||||
int64_t n_elements_arange = n_heads_log2_floor;
|
||||
aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
|
||||
|
||||
int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
|
||||
size_t tmp_arange1_nb[] = {sizeof(dst->type)};
|
||||
aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
|
||||
tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
|
||||
aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
|
||||
|
||||
aclTensor* tmp_arange2_tensor = nullptr;
|
||||
if (n_heads_log2_floor < ne2_ne3) {
|
||||
// arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
|
||||
start = 1;
|
||||
stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
|
||||
step = 2;
|
||||
n_elements_arange = ne2_ne3 - n_heads_log2_floor;
|
||||
int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
|
||||
size_t tmp_arange2_nb[] = {sizeof(dst->type)};
|
||||
|
||||
aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor(
|
||||
(char*)tmp_arange_buffer +
|
||||
n_heads_log2_floor * ggml_type_size(dst->type),
|
||||
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
||||
tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
|
||||
n_elements_arange);
|
||||
}
|
||||
|
||||
// init mk_base
|
||||
ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
|
||||
ne2_ne3 * ggml_type_size(dst->type));
|
||||
void* tmp_mk_base_buffer = mk_base_allocator.get();
|
||||
int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
|
||||
size_t tmp_mk_base1_nb[] = {sizeof(dst->type)};
|
||||
aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
|
||||
aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
|
||||
|
||||
aclTensor* tmp_mk_base2_tensor = nullptr;
|
||||
if (n_heads_log2_floor < ne2_ne3) {
|
||||
int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
|
||||
size_t tmp_mk_base2_nb[] = {sizeof(dst->type)};
|
||||
aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor(
|
||||
(char*)tmp_mk_base_buffer +
|
||||
n_heads_log2_floor * ggml_type_size(dst->type),
|
||||
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
||||
tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
|
||||
}
|
||||
|
||||
// init mk
|
||||
int64_t tmp_mk_base_ne[] = {ne2_ne3};
|
||||
size_t tmp_mk_base_nb[] = {sizeof(dst->type)};
|
||||
aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
|
||||
tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
|
||||
|
||||
// reshape mk
|
||||
int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]};
|
||||
size_t tmp_mk_nb[GGML_MAX_DIMS];
|
||||
tmp_mk_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
|
||||
}
|
||||
aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
|
||||
ACL_FORMAT_ND);
|
||||
|
||||
// acl_position * mk
|
||||
int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]};
|
||||
size_t tmp_output_nb[GGML_MAX_DIMS];
|
||||
tmp_output_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1];
|
||||
}
|
||||
ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst));
|
||||
void* tmp_output_buffer = output_allocator.get();
|
||||
aclTensor* tmp_output_tensor = ggml_cann_create_tensor(
|
||||
tmp_output_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS,
|
||||
ACL_FORMAT_ND);
|
||||
aclnn_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor);
|
||||
|
||||
// add
|
||||
aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
|
||||
ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
|
||||
tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
|
||||
tmp_arange_tensor, tmp_mk_tensor, tmp_output_tensor);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc, arange_tensor, slope_tensor);
|
||||
ggml_cann_release_resources(ctx, sc, arange_tensor, slope_tensor);
|
||||
}
|
||||
|
||||
void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
/**
|
||||
* @brief Compute slope values for multiple attention heads based on ALiBi bias parameters.
|
||||
*
|
||||
* This function generates slope values for each attention head according to the ALiBi
|
||||
* (Attention with Linear Biases) method. It splits the computation into two ranges depending
|
||||
* on whether the head index is less than @p n_head_log2 or not, and uses different base values
|
||||
* (`m0` and `m1`) for the exponentiation.
|
||||
*
|
||||
* @f[
|
||||
* slope[h] =
|
||||
* \begin{cases}
|
||||
* m_0^{(h + 1)}, & h < n\_head\_log2 \\
|
||||
* m_1^{\left( 2 \cdot (h - n\_head\_log2) + 1 \right)}, & h \geq n\_head\_log2
|
||||
* \end{cases}
|
||||
* \quad , \quad \text{if } max\_bias > 0
|
||||
* @f]
|
||||
*
|
||||
* If @p max_bias <= 0, all slope values are set to 1.0.
|
||||
*
|
||||
* @param ctx CANN backend context for memory allocation and operator execution.
|
||||
* @param n_head Total number of attention heads.
|
||||
* @param slope_buffer Pointer to the output buffer (float array) for storing slopes.
|
||||
* @param max_bias Maximum bias value for slope computation.
|
||||
*
|
||||
*/
|
||||
static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
|
||||
void* slope_buffer, float max_bias) {
|
||||
const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
|
||||
|
||||
float m0 = powf(2.0f, -(max_bias) / n_head_log2);
|
||||
float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
// const float slope = (max_bias > 0.0f) ?
|
||||
// h < n_head_log2 ?
|
||||
// powf(m0, h + 1) :
|
||||
// powf(m1, 2*(h - n_head_log2) + 1) :
|
||||
// 1.0f;
|
||||
// arange1
|
||||
float start = 0 + 1;
|
||||
float end = (n_head_log2 - 1) + 1;
|
||||
float step = 1;
|
||||
float count = n_head_log2;
|
||||
// end needs to be +1 because aclnn uses a left-closed, right-open interval.
|
||||
aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step);
|
||||
if (n_head_log2 < n_head) {
|
||||
// arange2
|
||||
start = 2 * (n_head_log2 - n_head_log2) + 1;
|
||||
end = 2 * ((n_head - 1) - n_head_log2) + 1;
|
||||
step = 2;
|
||||
count = n_head - n_head_log2;
|
||||
aclnn_get_slope_inner(
|
||||
ctx, (char *) slope_buffer + n_head_log2 * sizeof(float),
|
||||
m1, count, start, end + 1, step);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Add ALiBi (Attention with Linear Biases) positional biases to the attention mask.
|
||||
*
|
||||
* This function computes the ALiBi slopes for each attention head (if max_bias > 0),
|
||||
* multiplies them with the attention mask to produce bias tensors, and adds these biases
|
||||
* to the destination tensor (@p dst).
|
||||
*
|
||||
* The function performs necessary broadcasting of the mask and slope tensors to match
|
||||
* the shape of the destination tensor, then applies element-wise multiplication and addition
|
||||
* using CANN operators.
|
||||
*
|
||||
* @param ctx CANN backend context for memory management and operator execution.
|
||||
* @param mask Input attention mask tensor, assumed to be contiguous.
|
||||
* @param dst Destination tensor to which ALiBi biases will be added.
|
||||
* @param dst_ptr Pointer to the memory of the destination tensor.
|
||||
* @param max_bias Maximum bias value controlling the slope scaling.
|
||||
*
|
||||
* @note
|
||||
* - Write data into dst_ptr using only the shape information of the dst tensor.
|
||||
* - `GGML_MAX_DIMS + 2` is used to extend tensor dimensions for broadcasting.
|
||||
*/
|
||||
static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
|
||||
ggml_tensor* dst, void* dst_ptr, float max_bias) {
|
||||
void* slope_buffer = nullptr;
|
||||
void* bias_buffer = nullptr;
|
||||
|
||||
if (max_bias > 0.0f) {
|
||||
int64_t n_heads = dst->ne[2];
|
||||
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
|
||||
slope_buffer = slope_allocator.get();
|
||||
ggml_cann_pool_alloc bias_allocator(
|
||||
ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
|
||||
bias_buffer = bias_allocator.get();
|
||||
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
|
||||
}
|
||||
|
||||
// broadcast for mask, slop and dst;
|
||||
int64_t nr2 = dst->ne[2] / mask->ne[2];
|
||||
int64_t nr3 = dst->ne[3] / mask->ne[3];
|
||||
|
||||
// broadcast the mask across rows
|
||||
int64_t mask_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], 1, mask->ne[3], 1 };
|
||||
size_t mask_nb[] = {
|
||||
mask_nb[0] = mask->nb[0], mask_nb[1] = mask->nb[1], mask_nb[2] = mask->nb[2],
|
||||
mask_nb[3] = mask->nb[2], mask_nb[4] = mask->nb[3], mask_nb[5] = mask->nb[3]
|
||||
};
|
||||
|
||||
int64_t dst_ne[] = { dst->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], nr3 };
|
||||
size_t dst_nb[] = {
|
||||
dst_nb[0] = dst->nb[0], dst_nb[1] = dst->nb[1], dst_nb[2] = dst->nb[2],
|
||||
dst_nb[3] = dst->nb[2], dst_nb[4] = dst->nb[3], dst_nb[5] = dst->nb[3]
|
||||
};
|
||||
|
||||
// slope is a 1 dim tensor, slope.ne2 == dst.ne2
|
||||
int64_t slope_ne[] = { 1, 1, mask->ne[2], nr2, 1, 1 };
|
||||
size_t slope_nb[GGML_MAX_DIMS + 2];
|
||||
slope_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
|
||||
slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
|
||||
}
|
||||
|
||||
aclTensor* acl_slope = ggml_cann_create_tensor(
|
||||
slope_buffer, ACL_FLOAT, sizeof(float),
|
||||
slope_ne, slope_nb, GGML_MAX_DIMS + 2);
|
||||
aclTensor* acl_mask = ggml_cann_create_tensor(
|
||||
mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);
|
||||
|
||||
// write data into dst_ptr using only the shape information of the dst tensor.
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(
|
||||
dst_ptr, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), dst_ne, dst_nb,
|
||||
GGML_MAX_DIMS + 2);
|
||||
|
||||
if (max_bias > 0.0f) {
|
||||
int64_t bias_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], 1 };
|
||||
size_t bias_nb[GGML_MAX_DIMS + 2];
|
||||
bias_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
|
||||
bias_nb[i] = bias_nb[i - 1] * bias_ne[i - 1];
|
||||
}
|
||||
aclTensor* bias_tensor = ggml_cann_create_tensor(
|
||||
bias_buffer, ACL_FLOAT, sizeof(float),
|
||||
bias_ne, bias_nb, GGML_MAX_DIMS + 2);
|
||||
|
||||
aclnn_mul(ctx, acl_slope, acl_mask, bias_tensor);
|
||||
aclnn_add(ctx, acl_dst, bias_tensor);
|
||||
ggml_cann_release_resources(ctx, bias_tensor);
|
||||
} else {
|
||||
aclnn_add(ctx, acl_dst, acl_mask);
|
||||
}
|
||||
ggml_cann_release_resources(ctx, acl_slope, acl_mask, acl_dst);
|
||||
}
|
||||
|
||||
void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cann_dup(ctx, dst);
|
||||
}
|
||||
|
||||
@@ -1501,118 +1523,41 @@ void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
* @param acl_dst The destination tensor where the softmax results will be
|
||||
* stored.
|
||||
*/
|
||||
static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
int64_t dim, aclTensor* acl_dst) {
|
||||
static void aclnn_softmax(ggml_backend_cann_context & ctx,
|
||||
aclTensor* acl_src, int64_t dim, aclTensor * acl_dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
|
||||
}
|
||||
|
||||
void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
ggml_tensor* src0 = dst->src[0];
|
||||
ggml_tensor* src1 = dst->src[1]; // mask
|
||||
|
||||
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
float scale = 1.0f;
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (float*)dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float));
|
||||
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
|
||||
|
||||
// input mul scale
|
||||
aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
|
||||
ggml_cann_pool_alloc src_tensor_allocator(ctx.pool(), ggml_nbytes(src0));
|
||||
void* src_tensor_buffer = src_tensor_allocator.get();
|
||||
aclTensor* softmax_tensor = ggml_cann_create_tensor(
|
||||
src_tensor_buffer, ggml_cann_type_mapping(src0->type),
|
||||
ggml_element_size(src0), src0->ne, src0->nb,GGML_MAX_DIMS);
|
||||
|
||||
size_t n_bytes = ggml_nbytes(src0);
|
||||
ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes);
|
||||
void* input_mul_scale_buffer = mul_scale_allocator.get();
|
||||
aclTensor* acl_input_mul_scale_tensor = ggml_cann_create_tensor(
|
||||
input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne,
|
||||
src0->nb, GGML_MAX_DIMS);
|
||||
|
||||
bool inplace = false;
|
||||
aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace);
|
||||
aclnn_muls(ctx, acl_src0, scale, softmax_tensor, false);
|
||||
|
||||
// mask
|
||||
aclTensor* acl_src1_fp32_tensor = nullptr;
|
||||
aclTensor* tmp_mask_tensor = nullptr;
|
||||
ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool());
|
||||
if (src1) {
|
||||
const bool use_f16 = src1->type == GGML_TYPE_F16;
|
||||
if (use_f16) {
|
||||
// cast to fp32
|
||||
size_t n_bytes = ggml_nelements(src1) * sizeof(float_t);
|
||||
size_t src1_fp32_nb[GGML_MAX_DIMS];
|
||||
src1_fp32_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1];
|
||||
}
|
||||
src1_fp32_allocator.alloc(n_bytes);
|
||||
void* src1_fp32_buffer = src1_fp32_allocator.get();
|
||||
acl_src1_fp32_tensor = ggml_cann_create_tensor(
|
||||
src1_fp32_buffer, ACL_FLOAT, sizeof(float), src1->ne,
|
||||
src1_fp32_nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
|
||||
aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
|
||||
ggml_cann_release_resources(ctx, acl_src1);
|
||||
} else {
|
||||
acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
|
||||
}
|
||||
|
||||
// broadcast the mask across rows, only use ne11 of ne01 in mask
|
||||
if (src1->ne[1] != src0->ne[1]) {
|
||||
// mask shape: [1,1,ne11,ne10]
|
||||
int64_t tmp_mask_ne[] = {src0->ne[0], src0->ne[1], 1, 1};
|
||||
size_t tmp_mask_nb[GGML_MAX_DIMS];
|
||||
tmp_mask_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1];
|
||||
}
|
||||
tmp_mask_tensor = ggml_cann_create_tensor(
|
||||
src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb,
|
||||
GGML_MAX_DIMS, ACL_FORMAT_ND);
|
||||
}
|
||||
|
||||
// alibi
|
||||
const int n_head = src0->ne[2];
|
||||
const size_t src_nb0 = src0->nb[0];
|
||||
|
||||
n_bytes = ggml_nbytes(dst);
|
||||
ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes);
|
||||
void* output_buffer = output_allocator.get();
|
||||
aclTensor* alibi_output_tensor = ggml_cann_create_tensor(
|
||||
output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne,
|
||||
dst->nb, GGML_MAX_DIMS);
|
||||
if (max_bias <= 0.0f) {
|
||||
// slope = 1.0
|
||||
if (tmp_mask_tensor) {
|
||||
aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor,
|
||||
alibi_output_tensor);
|
||||
} else {
|
||||
aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor,
|
||||
alibi_output_tensor);
|
||||
}
|
||||
} else {
|
||||
// slope != 1.0
|
||||
if (tmp_mask_tensor) {
|
||||
aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor,
|
||||
alibi_output_tensor, n_head, src0->ne, src_nb0,
|
||||
max_bias, dst);
|
||||
} else {
|
||||
aclnn_alibi(ctx, acl_input_mul_scale_tensor,
|
||||
acl_src1_fp32_tensor, alibi_output_tensor, n_head,
|
||||
src0->ne, src_nb0, max_bias, dst);
|
||||
}
|
||||
}
|
||||
|
||||
// softmax
|
||||
aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
|
||||
ggml_cann_release_resources(ctx, alibi_output_tensor);
|
||||
} else {
|
||||
aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
|
||||
aclnn_add_alibi(ctx, src1, src0, src_tensor_buffer, max_bias);
|
||||
}
|
||||
|
||||
ggml_cann_release_resources(ctx, acl_src0, acl_src1_fp32_tensor, acl_dst,
|
||||
acl_scale, acl_input_mul_scale_tensor, tmp_mask_tensor);
|
||||
// softmax
|
||||
aclnn_softmax(ctx, softmax_tensor, 3, acl_dst);
|
||||
ggml_cann_release_resources(ctx, acl_src0, acl_dst, acl_scale, softmax_tensor);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -3208,104 +3153,24 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
||||
// Compute the slope if needed. Derived from ggml_cann_softmax().
|
||||
if(maxBias != 0.0f){
|
||||
// alibi
|
||||
const int64_t ne2_ne3 = src0->ne[2] * src0->ne[3];
|
||||
const int64_t n_head = src0->ne[2];
|
||||
const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
|
||||
float m0 = powf(2.0f, -(maxBias) / n_heads_log2_floor);
|
||||
float m1 = powf(2.0f, -(maxBias / 2.0f) / n_heads_log2_floor);
|
||||
// init arange
|
||||
ggml_cann_pool_alloc arange_allocator(ctx.pool(),
|
||||
ne2_ne3 * faElemSize);
|
||||
void* tmp_arange_buffer = arange_allocator.get();
|
||||
const int64_t n_heads = src0->ne[2];
|
||||
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
|
||||
void* slope_buffer = slope_allocator.get();
|
||||
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias);
|
||||
|
||||
// arange1: [1, ..., n_heads_log2_floor+1)
|
||||
float start = 1;
|
||||
float stop = n_heads_log2_floor + 1;
|
||||
float step = 1;
|
||||
int64_t n_elements_arange = n_heads_log2_floor;
|
||||
|
||||
int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
|
||||
size_t tmp_arange1_nb[] = {faElemSize};
|
||||
aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
|
||||
tmp_arange_buffer, faDataType, faElemSize,
|
||||
tmp_arange1_ne, tmp_arange1_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
|
||||
aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
|
||||
|
||||
aclTensor* tmp_arange2_tensor = nullptr;
|
||||
if (n_heads_log2_floor < ne2_ne3) {
|
||||
// arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
|
||||
start = 1;
|
||||
stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
|
||||
step = 2;
|
||||
n_elements_arange = ne2_ne3 - n_heads_log2_floor;
|
||||
int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
|
||||
size_t tmp_arange2_nb[] = {faElemSize};
|
||||
|
||||
aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor(
|
||||
(char*)tmp_arange_buffer +
|
||||
n_heads_log2_floor * faElemSize,
|
||||
faDataType, faElemSize,
|
||||
tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
|
||||
n_elements_arange);
|
||||
int64_t slope_ne[] = {1, 1, n_heads, 1};
|
||||
size_t slope_nb[GGML_MAX_DIMS];
|
||||
slope_nb[0] = sizeof(float);
|
||||
for(int i = 1;i<GGML_MAX_DIMS;i++) {
|
||||
slope_nb[i] = slope_nb[i-1] * slope_ne[0];
|
||||
}
|
||||
|
||||
// init mk_base
|
||||
ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
|
||||
ne2_ne3 * faElemSize);
|
||||
void* tmp_mk_base_buffer = mk_base_allocator.get();
|
||||
int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
|
||||
size_t tmp_mk_base1_nb[] = {faElemSize};
|
||||
aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, faDataType, faElemSize,
|
||||
tmp_mk_base1_ne, tmp_mk_base1_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclTensor* slope_tensor = ggml_cann_create_tensor(
|
||||
slope_buffer, ACL_FLOAT, sizeof(float),
|
||||
slope_ne, slope_nb, GGML_MAX_DIMS);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, slope_tensor);
|
||||
|
||||
aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
|
||||
|
||||
aclTensor* tmp_mk_base2_tensor = nullptr;
|
||||
if (n_heads_log2_floor < ne2_ne3) {
|
||||
int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
|
||||
size_t tmp_mk_base2_nb[] = {faElemSize};
|
||||
aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor(
|
||||
(char*)tmp_mk_base_buffer +
|
||||
n_heads_log2_floor * faElemSize,
|
||||
faDataType, faElemSize,
|
||||
tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
|
||||
}
|
||||
|
||||
// init mk
|
||||
int64_t tmp_mk_base_ne[] = {ne2_ne3};
|
||||
size_t tmp_mk_base_nb[] = {faElemSize};
|
||||
aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, faDataType, faElemSize,
|
||||
tmp_mk_base_ne, tmp_mk_base_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
|
||||
tmp_arange_buffer, faDataType, faElemSize,
|
||||
tmp_mk_base_ne, tmp_mk_base_nb,
|
||||
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
||||
aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
|
||||
|
||||
// reshape mk
|
||||
int64_t tmp_mk_ne[] = {1, 1, src0->ne[2], src0->ne[3]};
|
||||
size_t tmp_mk_nb[GGML_MAX_DIMS];
|
||||
tmp_mk_nb[0] = faElemSize;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
|
||||
}
|
||||
aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
|
||||
tmp_mk_base_buffer, faDataType, faElemSize,
|
||||
tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
|
||||
ACL_FORMAT_ND);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, tmp_mk_tensor);
|
||||
|
||||
ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
|
||||
tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
|
||||
tmp_arange_tensor, tmp_mk_tensor);
|
||||
ggml_cann_release_resources(ctx, slope_tensor);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2456,8 +2456,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
// value of paddingW should be at most half of kernelW
|
||||
return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
|
||||
}
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_REPEAT:
|
||||
@@ -2503,9 +2503,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
if (op->src[2]) {
|
||||
return false;
|
||||
}
|
||||
// TODO: support broadcast
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
|
||||
return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT:{
|
||||
// derived from [ggml-cuda.cu]
|
||||
if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
|
||||
@@ -2532,11 +2530,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
// DeepSeek MLA
|
||||
return false;
|
||||
}
|
||||
// TODO: support broadcast
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
|
||||
if (op->src[0]->ne[3] != 1) {
|
||||
return false;
|
||||
}
|
||||
float logitSoftcap = 0.0f;
|
||||
memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
|
||||
if(logitSoftcap != 0.0f) {
|
||||
|
||||
@@ -40,18 +40,22 @@
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||
// repack.cpp
|
||||
@@ -80,12 +84,14 @@
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#elif defined(__loongarch64)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
@@ -103,12 +109,14 @@
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#elif defined(__riscv)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
@@ -133,11 +141,13 @@
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#elif defined(__s390x__)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
@@ -164,12 +174,14 @@
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#elif defined(__wasm__)
|
||||
// quants.c
|
||||
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
||||
@@ -195,10 +207,12 @@
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||
#endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
+200
-51
@@ -206,8 +206,9 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 4;
|
||||
|
||||
assert (n % qk == 0);
|
||||
assert (nc % ncols_interleaved == 0);
|
||||
assert(nr == 1);
|
||||
assert(n % qk == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(s);
|
||||
UNUSED(bs);
|
||||
@@ -307,30 +308,28 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
{
|
||||
float sumf[8];
|
||||
int sumi;
|
||||
float sumf[8];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -494,43 +493,73 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
||||
const int ncols_interleaved = 4;
|
||||
const int blocklen = 4;
|
||||
|
||||
assert (n % qk == 0);
|
||||
assert (nc % ncols_interleaved == 0);
|
||||
assert(nr == 1);
|
||||
assert(n % qk == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(s);
|
||||
UNUSED(bs);
|
||||
UNUSED(vx);
|
||||
UNUSED(vy);
|
||||
UNUSED(nr);
|
||||
UNUSED(nc);
|
||||
UNUSED(nb);
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
{
|
||||
float sumf[4];
|
||||
int sumi;
|
||||
float sumf[4];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
||||
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
||||
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 8;
|
||||
const int blocklen = 8;
|
||||
|
||||
assert(nr == 1);
|
||||
assert(n % qk == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
UNUSED(bs);
|
||||
UNUSED(nr);
|
||||
|
||||
float sumf[8];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
||||
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -934,6 +963,50 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
const int ncols_interleaved = 8;
|
||||
const int blocklen = 8;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nr % 4 == 0);
|
||||
assert(nc % ncols_interleaved == 0);
|
||||
|
||||
float sumf[4][8];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
||||
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
||||
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
||||
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++)
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
|
||||
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
|
||||
@@ -1285,15 +1358,16 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
|
||||
|
||||
static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
||||
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
||||
//GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
||||
GGML_ASSERT(interleave_block == 4);
|
||||
|
||||
block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
|
||||
const block_iq4_nl * src = (const block_iq4_nl *)data;
|
||||
const block_iq4_nl * src = (const block_iq4_nl *)data;
|
||||
block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
|
||||
|
||||
block_iq4_nl dst_tmp[4];
|
||||
|
||||
int nrow = ggml_nrows(t);
|
||||
int nrows_interleaved = 4;
|
||||
int nblocks = t->ne[0] / QK4_0;
|
||||
int nblocks = t->ne[0] / QK4_NL;
|
||||
|
||||
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
||||
|
||||
@@ -1315,6 +1389,63 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
|
||||
GGML_UNUSED(data_size);
|
||||
}
|
||||
|
||||
static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
|
||||
block_iq4_nlx8 out;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
out.d[i] = in[i].d;
|
||||
}
|
||||
|
||||
const int end = QK4_NL * 4 / blck_size_interleave;
|
||||
|
||||
if (blck_size_interleave == 8) {
|
||||
for (int i = 0; i < end; ++i) {
|
||||
int src_id = i % 8;
|
||||
int src_offset = (i / 8) * blck_size_interleave;
|
||||
int dst_offset = i * blck_size_interleave;
|
||||
|
||||
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
||||
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
||||
GGML_ASSERT(interleave_block == 8);
|
||||
|
||||
const block_iq4_nl * src = (const block_iq4_nl *)data;
|
||||
block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
|
||||
|
||||
block_iq4_nl dst_tmp[8];
|
||||
|
||||
int nrow = ggml_nrows(t);
|
||||
int nrows_interleaved = 8;
|
||||
int nblocks = t->ne[0] / QK4_NL;
|
||||
|
||||
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
||||
|
||||
if (t->ne[1] % nrows_interleaved != 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
||||
for (int64_t x = 0; x < nblocks; x++) {
|
||||
for (int i = 0; i < nrows_interleaved; i++) {
|
||||
dst_tmp[i] = src[x + i * nblocks];
|
||||
}
|
||||
*dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
|
||||
}
|
||||
src += nrows_interleaved * nblocks;
|
||||
}
|
||||
return 0;
|
||||
|
||||
GGML_UNUSED(data_size);
|
||||
}
|
||||
|
||||
namespace ggml::cpu::repack {
|
||||
// repack
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
|
||||
@@ -1350,6 +1481,10 @@ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void *
|
||||
// return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
|
||||
//}
|
||||
|
||||
template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||
return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
|
||||
}
|
||||
|
||||
// gemv
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
||||
void gemv(int, float *, size_t, const void *, const void *, int, int);
|
||||
@@ -1378,6 +1513,10 @@ template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
|
||||
ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
// gemm
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
||||
void gemm(int, float *, size_t, const void *, const void *, int, int);
|
||||
@@ -1406,6 +1545,10 @@ template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
|
||||
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
class tensor_traits_base : public ggml::cpu::tensor_traits {
|
||||
public:
|
||||
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
|
||||
@@ -1680,6 +1823,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
||||
|
||||
// instance for IQ4
|
||||
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
|
||||
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
|
||||
|
||||
if (cur->type == GGML_TYPE_Q4_0) {
|
||||
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
|
||||
@@ -1710,6 +1854,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
||||
}
|
||||
}
|
||||
} else if (cur->type == GGML_TYPE_IQ4_NL) {
|
||||
if (ggml_cpu_has_avx2()) {
|
||||
if (cur->ne[1] % 8 == 0) {
|
||||
return &iq4_nl_8x8_q8_0;
|
||||
}
|
||||
}
|
||||
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
||||
if (cur->ne[1] % 4 == 0) {
|
||||
return &iq4_nl_4x4_q8_0;
|
||||
|
||||
@@ -67,6 +67,13 @@ struct block_iq4_nlx4 {
|
||||
|
||||
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
|
||||
|
||||
struct block_iq4_nlx8 {
|
||||
ggml_half d[8]; // deltas for 8 iq4_nl blocks
|
||||
uint8_t qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks
|
||||
};
|
||||
|
||||
static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
@@ -80,12 +87,14 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
// Native implementations
|
||||
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
@@ -97,12 +106,14 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
} // extern "C"
|
||||
|
||||
@@ -120,6 +120,10 @@ if (CUDAToolkit_FOUND)
|
||||
|
||||
set(CUDA_FLAGS -use_fast_math -extended-lambda)
|
||||
|
||||
if (GGML_CUDA_DEBUG)
|
||||
list(APPEND CUDA_FLAGS -lineinfo)
|
||||
endif()
|
||||
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
||||
# Options are:
|
||||
# - none (not recommended)
|
||||
|
||||
@@ -87,6 +87,10 @@
|
||||
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
|
||||
#define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)
|
||||
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
# define GGML_CUDA_USE_CUB
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
|
||||
#ifdef __CUDA_ARCH_LIST__
|
||||
constexpr bool ggml_cuda_has_arch_impl(int) {
|
||||
return false;
|
||||
@@ -312,11 +316,11 @@ static bool turing_mma_available(const int cc) {
|
||||
}
|
||||
|
||||
static bool ampere_mma_available(const int cc) {
|
||||
return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
|
||||
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
|
||||
}
|
||||
|
||||
static bool cp_async_available(const int cc) {
|
||||
return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
|
||||
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
|
||||
}
|
||||
|
||||
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
||||
@@ -420,26 +424,6 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
||||
#endif // FP16_AVAILABLE
|
||||
}
|
||||
|
||||
// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
|
||||
template<bool norm>
|
||||
static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
|
||||
const int row = blockIdx.x;
|
||||
const int col = threadIdx.x;
|
||||
|
||||
float sum = 0.0f;
|
||||
for (int i = col; i < ncols; i += blockDim.x) {
|
||||
sum += x[row * ncols + i];
|
||||
}
|
||||
|
||||
sum = warp_reduce_sum(sum);
|
||||
|
||||
if (col != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[row] = norm ? sum / ncols : sum;
|
||||
}
|
||||
|
||||
template<int width = WARP_SIZE>
|
||||
static __device__ __forceinline__ int warp_reduce_all(int x) {
|
||||
#ifdef GGML_USE_HIP
|
||||
|
||||
@@ -15,7 +15,6 @@ namespace wmma = mtmusa::wmma;
|
||||
namespace wmma = nvcuda::wmma;
|
||||
#endif // GGML_USE_MUSA
|
||||
#elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
|
||||
#undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
|
||||
#include <rocwmma/rocwmma.hpp>
|
||||
namespace wmma = rocwmma;
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
|
||||
@@ -1,4 +1,14 @@
|
||||
#include "mean.cuh"
|
||||
#include "reduce_rows.cuh"
|
||||
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
#include <cub/cub.cuh>
|
||||
using namespace cub;
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
template <typename T> __global__ void divide_by_count(T * result, size_t count) {
|
||||
*result /= static_cast<T>(count);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
@@ -13,7 +23,45 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const int64_t ncols = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
// Special case for reducing vectors
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
cudaStreamCaptureStatus iscapturing;
|
||||
CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
|
||||
if ((nrows == 1) &&
|
||||
// CUDA_GRAPHS_DISABLED
|
||||
((ncols > 65536) &&
|
||||
((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
||||
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
|
||||
ctx.cuda_graph->disable_due_to_failed_graph_capture)) ||
|
||||
// CUDA_GRAPHS ENABLED
|
||||
((ncols > 32768) &&
|
||||
!((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
||||
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
|
||||
ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
|
||||
// Single row - use device-wide reduction
|
||||
size_t tmp_size = 0;
|
||||
ggml_cuda_pool & pool = ctx.pool();
|
||||
|
||||
DeviceReduce::Sum(nullptr, tmp_size, src0_d, dst_d, ncols, stream);
|
||||
|
||||
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
|
||||
DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, src0_d, dst_d, ncols, stream);
|
||||
|
||||
// Divide by ncols
|
||||
divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
||||
if ((nrows / nsm) < 2) {
|
||||
const dim3 block_dims(512, 1, 1);
|
||||
reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
} else {
|
||||
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
|
||||
reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,53 @@
|
||||
#include "common.cuh"
|
||||
|
||||
// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
|
||||
template <bool norm>
|
||||
static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
|
||||
const int row = blockIdx.x;
|
||||
const int col = threadIdx.x;
|
||||
|
||||
float sum = 0.0f;
|
||||
const int num_unroll = 8;
|
||||
float temp[num_unroll];
|
||||
float sum_temp[num_unroll] = { 0.0f };
|
||||
for (int i = col; i < ncols;) {
|
||||
for (int j = 0; j < num_unroll; ++j) {
|
||||
if (i < ncols) {
|
||||
temp[j] = x[row * ncols + i];
|
||||
} else {
|
||||
temp[j] = 0;
|
||||
}
|
||||
i += blockDim.x;
|
||||
}
|
||||
for (int j = 0; j < num_unroll; ++j) {
|
||||
sum_temp[j] += temp[j];
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < num_unroll; ++j) {
|
||||
sum += sum_temp[j];
|
||||
}
|
||||
|
||||
// sum up partial sums
|
||||
sum = warp_reduce_sum(sum);
|
||||
if (blockDim.x > WARP_SIZE) {
|
||||
assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
|
||||
__shared__ float s_sum[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = sum;
|
||||
}
|
||||
__syncthreads();
|
||||
sum = 0.0f;
|
||||
if (lane_id < (blockDim.x / WARP_SIZE)) {
|
||||
sum = s_sum[lane_id];
|
||||
}
|
||||
sum = warp_reduce_sum(sum);
|
||||
}
|
||||
|
||||
if (col != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[row] = norm ? sum / ncols : sum;
|
||||
}
|
||||
@@ -1,19 +1,15 @@
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
#define USE_CUB
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
#include "sum.cuh"
|
||||
#include "sumrows.cuh"
|
||||
|
||||
#ifdef USE_CUB
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
#include <cub/cub.cuh>
|
||||
using namespace cub;
|
||||
#endif // USE_CUB
|
||||
|
||||
#include "sumrows.cuh"
|
||||
#include "sum.cuh"
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
|
||||
#ifdef USE_CUB
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
size_t tmp_size = 0;
|
||||
DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
|
||||
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
|
||||
@@ -23,7 +19,7 @@ void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int
|
||||
// For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
|
||||
sum_rows_f32_cuda(x, dst, ne, 1, stream);
|
||||
GGML_UNUSED(pool);
|
||||
#endif // USE_CUB
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
@@ -1,9 +1,17 @@
|
||||
#include "reduce_rows.cuh"
|
||||
#include "sumrows.cuh"
|
||||
|
||||
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
reduce_rows_f32</*norm*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
if ((nrows / nsm) < 2) {
|
||||
const dim3 block_dims(512, 1, 1);
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
} else {
|
||||
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
@@ -19,8 +27,17 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const int64_t ncols = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
||||
if ((nrows / nsm) < 2) {
|
||||
// Increase num threads to 512 for small nrows to better hide the latency
|
||||
const dim3 block_dims(512, 1, 1);
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
} else {
|
||||
// Enough active SMs to hide latency, use smaller blocks to allow better scheduling
|
||||
const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
|
||||
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
}
|
||||
}
|
||||
|
||||
Vendored
+1
-1
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
|
||||
#define HIP_DISABLE_WARP_SYNC_BUILTINS 1
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hipblas/hipblas.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
@@ -2481,6 +2481,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_OP_SCALE:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_ADD:
|
||||
if (op->type == GGML_TYPE_F16) {
|
||||
const bool src0_ok = op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32;
|
||||
const bool src1_ok = op->src[1]->type == GGML_TYPE_F16 || op->src[1]->type == GGML_TYPE_F32;
|
||||
if (src0_ok && src1_ok) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_SUB:
|
||||
@@ -3717,34 +3724,30 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_ulong nb00 = src0->nb[0];
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3]; UNUSED(ne13);
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3];
|
||||
|
||||
const cl_ulong nb10 = src1->nb[0];
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
|
||||
const cl_ulong nb13 = src1->nb[3];
|
||||
|
||||
const int ne0 = dst->ne[0];
|
||||
const int ne1 = dst->ne[1];
|
||||
const int ne2 = dst->ne[2];
|
||||
const int ne3 = dst->ne[3];
|
||||
const int ne0 = dst->ne[0];
|
||||
const int ne1 = dst->ne[1];
|
||||
const int ne2 = dst->ne[2];
|
||||
const int ne3 = dst->ne[3];
|
||||
|
||||
const cl_ulong nb0 = dst->nb[0];
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
@@ -3761,68 +3764,114 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
bool bcast_row = false;
|
||||
cl_kernel kernel;
|
||||
|
||||
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
|
||||
const bool bcast_row = ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0;
|
||||
|
||||
if (bcast_row) {
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
// src1 is a row
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
}
|
||||
|
||||
bcast_row = true;
|
||||
int ne = ne00 / 4;
|
||||
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32);
|
||||
if (bcast_row) {
|
||||
kernel = backend_ctx->kernel_add_row;
|
||||
const int ne = ne00 / 4;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_add_row_f16;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||
} else {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_add;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
|
||||
}
|
||||
} else if (dst->type == GGML_TYPE_F16) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
|
||||
const int type_src0 = (src0->type == GGML_TYPE_F32);
|
||||
const int type_src1 = (src1->type == GGML_TYPE_F32);
|
||||
if (bcast_row) {
|
||||
kernel = backend_ctx->kernel_add_row_f16;
|
||||
const int ne = ne00 / 4;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &type_src0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &type_src1));
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_add_f16;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 30, sizeof(int), &type_src0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 31, sizeof(int), &type_src1));
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
|
||||
} else {
|
||||
GGML_ASSERT(false && "unsupported data types for add");
|
||||
}
|
||||
|
||||
if (bcast_row) {
|
||||
@@ -3832,13 +3881,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
||||
local_work_size_ptr = nullptr;
|
||||
}
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size_ptr, dst);
|
||||
} else {
|
||||
unsigned int nth = MIN(64, ne0);
|
||||
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {nth, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
@@ -112,7 +112,9 @@ kernel void kernel_add_f16(
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
ulong nb3,
|
||||
int type_src0,
|
||||
int type_src1
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
@@ -132,25 +134,57 @@ kernel void kernel_add_f16(
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const int i10 = i0 % ne10;
|
||||
*((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) + *((global half *)(src1_ptr + i10*nb10));
|
||||
|
||||
half v0, v1;
|
||||
if (type_src0 == 1) {
|
||||
v0 = convert_half(*((global float *)(src0_ptr + i0*nb00)));
|
||||
} else {
|
||||
v0 = *((global half *)(src0_ptr + i0*nb00));
|
||||
}
|
||||
|
||||
if (type_src1 == 1) {
|
||||
v1 = convert_half(*((global float *)(src1_ptr + i10*nb10)));
|
||||
} else {
|
||||
v1 = *((global half *)(src1_ptr + i10*nb10));
|
||||
}
|
||||
|
||||
*((global half *)(dst_ptr + i0*nb0)) = v0 + v1;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_add_row_f16(
|
||||
global half4 * src0,
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global half4 * src1,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global half4 * dst,
|
||||
ulong offsetd,
|
||||
int ne
|
||||
int ne,
|
||||
int type_src0,
|
||||
int type_src1
|
||||
) {
|
||||
src0 = (global half4*)((global char*)src0 + offset0);
|
||||
src1 = (global half4*)((global char*)src1 + offset1);
|
||||
dst = (global half4*)((global char*)dst + offsetd);
|
||||
|
||||
// This performs better than using %.
|
||||
uint gid = get_global_id(0);
|
||||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||
dst[gid] = src0[gid] + src1[idx1];
|
||||
|
||||
half4 v0, v1;
|
||||
if (type_src0 == 1) {
|
||||
global float4* src0_f32 = (global float4*)((global char*)src0 + offset0);
|
||||
v0 = convert_half4(src0_f32[gid]);
|
||||
} else {
|
||||
global half4* src0_f16 = (global half4*)((global char*)src0 + offset0);
|
||||
v0 = src0_f16[gid];
|
||||
}
|
||||
|
||||
if (type_src1 == 1) {
|
||||
global float4* src1_f32 = (global float4*)((global char*)src1 + offset1);
|
||||
v1 = convert_half4(src1_f32[idx1]);
|
||||
} else {
|
||||
global half4* src1_f16 = (global half4*)((global char*)src1 + offset1);
|
||||
v1 = src1_f16[idx1];
|
||||
}
|
||||
|
||||
dst[gid] = v0 + v1;
|
||||
}
|
||||
|
||||
@@ -29,9 +29,12 @@
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <filesystem>
|
||||
#include <algorithm>
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
static constexpr size_t MAX_CHUNK_SIZE = 1024ull * 1024ull * 1024ull; // 1 GiB
|
||||
|
||||
#ifdef _WIN32
|
||||
typedef SOCKET sockfd_t;
|
||||
using ssize_t = __int64;
|
||||
@@ -323,11 +326,14 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
|
||||
static bool send_data(sockfd_t sockfd, const void * data, size_t size) {
|
||||
size_t bytes_sent = 0;
|
||||
while (bytes_sent < size) {
|
||||
ssize_t n = send(sockfd, (const char *)data + bytes_sent, size - bytes_sent, 0);
|
||||
size_t size_to_send = std::min(size - bytes_sent, MAX_CHUNK_SIZE);
|
||||
ssize_t n = send(sockfd, (const char *)data + bytes_sent, size_to_send, 0);
|
||||
if (n < 0) {
|
||||
GGML_LOG_ERROR("send failed (bytes_sent=%zu, size_to_send=%zu)\n",
|
||||
bytes_sent, size_to_send);
|
||||
return false;
|
||||
}
|
||||
bytes_sent += n;
|
||||
bytes_sent += (size_t)n;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -335,11 +341,18 @@ static bool send_data(sockfd_t sockfd, const void * data, size_t size) {
|
||||
static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
|
||||
size_t bytes_recv = 0;
|
||||
while (bytes_recv < size) {
|
||||
ssize_t n = recv(sockfd, (char *)data + bytes_recv, size - bytes_recv, 0);
|
||||
if (n <= 0) {
|
||||
size_t size_to_recv = std::min(size - bytes_recv, MAX_CHUNK_SIZE);
|
||||
ssize_t n = recv(sockfd, (char *)data + bytes_recv, size_to_recv, 0);
|
||||
if (n < 0) {
|
||||
GGML_LOG_ERROR("recv failed (bytes_recv=%zu, size_to_recv=%zu)\n",
|
||||
bytes_recv, size_to_recv);
|
||||
return false;
|
||||
}
|
||||
bytes_recv += n;
|
||||
if (n == 0) {
|
||||
GGML_LOG_ERROR("recv returned 0 (peer closed?)\n");
|
||||
return false;
|
||||
}
|
||||
bytes_recv += (size_t)n;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -2705,9 +2705,9 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||
" : converting src1 to fp16");
|
||||
|
||||
// iterate tensor dims and find the slowest moving dim and stride
|
||||
int64_t last_dim=0;
|
||||
int64_t last_str=0;
|
||||
int64_t largest_str=0;
|
||||
int last_dim=0;
|
||||
int last_str=0;
|
||||
size_t largest_str=0;
|
||||
for(int i = 0; i< 4; i++){
|
||||
// last stride is always the largest
|
||||
if(src1->nb[i] == largest_str){
|
||||
@@ -2783,7 +2783,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||
auto launch_gemm_for_batches = [&ctx, queue](const sycl::half *src0,
|
||||
const sycl::half *src1, float *dst,
|
||||
int64_t a0, int64_t a1, int64_t batcha,
|
||||
int64_t b0, int64_t b1, int64_t batchb,
|
||||
int64_t /*b0*/, int64_t b1, int64_t batchb,
|
||||
int64_t sa0, int64_t sa1, int64_t sa2,
|
||||
int64_t sb0, int64_t sb1, int64_t sb2,
|
||||
int64_t sd2) {
|
||||
@@ -2832,14 +2832,26 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||
}
|
||||
};
|
||||
|
||||
bool cont_batches_a = nb02 * ne02 == nb03;
|
||||
bool cont_batches_b = nb12 * ne12 == nb13;
|
||||
if (cont_batches_a && cont_batches_b) {
|
||||
const bool cont_batches_dim2_a = nb02 * ne02 == nb03;
|
||||
const bool cont_batches_dim2_b = nb12 * ne12 == nb13;
|
||||
const bool cont_batches_dim3_a = ne02 == 1 && nb02 * ne01 == nb03;
|
||||
const bool cont_batches_dim3_b = ne12 == 1 && nb12 * ne11 == nb13;
|
||||
if (cont_batches_dim2_a && cont_batches_dim2_b) {
|
||||
// A batch is considered contiguous if the dimension 2 is not strided
|
||||
int64_t batches0 = ne02 * ne03;
|
||||
int64_t batches1 = ne12 * ne13;
|
||||
launch_gemm_for_batches(src0_f16, src1_f16, dst_ddf, ne00, ne01, batches0,
|
||||
ne10, ne11, batches1, str_a0, str_a1, str_a2, str_b0, str_b1,
|
||||
str_b2, nb2 / sizeof(float));
|
||||
} else if (cont_batches_dim3_a && cont_batches_dim3_b) {
|
||||
// This case is similar to the one above with the difference that only the batch in dimension 3 is used and the dimension 2 is of size 1.
|
||||
int64_t batches0 = ne02 * ne03;
|
||||
int64_t batches1 = ne12 * ne13;
|
||||
int64_t str_a3 = nb03 / type_size_src0;
|
||||
int64_t str_b3 = nb13 / type_size_src1;
|
||||
launch_gemm_for_batches(src0_f16, src1_f16, dst_ddf, ne00, ne01, batches0,
|
||||
ne10, ne11, batches1, str_a0, str_a1, str_a3, str_b0, str_b1,
|
||||
str_b3, nb2 / sizeof(float));
|
||||
} else {
|
||||
for (int64_t b_a = 0; b_a < ne03; b_a++) {
|
||||
const sycl::half *src0_f16_shifted
|
||||
@@ -4215,6 +4227,15 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
// FIXME: keep a list of supported types to avoid breaking the backend when a new type is added
|
||||
return false;
|
||||
}
|
||||
// TODO: The configuration below needs more work to be supported with oneDNN
|
||||
if (ggml_is_permuted(a) && !ggml_is_contiguous(a) && a->ne[2] > 1 && a->ne[3] > 1) {
|
||||
return false;
|
||||
}
|
||||
// TODO: This specific configuration can fail with oneDNN and needs more debugging
|
||||
if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 &&
|
||||
a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_OUT_PROD:
|
||||
|
||||
@@ -223,12 +223,7 @@ void llama_kv_cache_unified::clear(bool data) {
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
||||
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
|
||||
|
||||
auto & cells = v_cells[seq_to_stream[seq_id]];
|
||||
auto & head = v_heads[seq_to_stream[seq_id]];
|
||||
|
||||
uint32_t new_head = cells.size();
|
||||
GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
|
||||
|
||||
if (p0 < 0) {
|
||||
p0 = 0;
|
||||
@@ -239,6 +234,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
||||
}
|
||||
|
||||
if (seq_id >= 0) {
|
||||
auto & cells = v_cells[seq_to_stream[seq_id]];
|
||||
auto & head = v_heads[seq_to_stream[seq_id]];
|
||||
|
||||
uint32_t new_head = cells.size();
|
||||
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (!cells.pos_in(i, p0, p1)) {
|
||||
continue;
|
||||
@@ -250,26 +250,38 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we freed up a slot, set head to it so searching can start there.
|
||||
if (new_head != cells.size() && new_head < head) {
|
||||
head = new_head;
|
||||
}
|
||||
} else {
|
||||
// match any sequence
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (!cells.pos_in(i, p0, p1)) {
|
||||
continue;
|
||||
for (uint32_t s = 0; s < n_stream; ++s) {
|
||||
auto & cells = v_cells[s];
|
||||
auto & head = v_heads[s];
|
||||
|
||||
uint32_t new_head = cells.size();
|
||||
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (!cells.pos_in(i, p0, p1)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
cells.rm(i);
|
||||
|
||||
if (new_head == cells.size()) {
|
||||
new_head = i;
|
||||
}
|
||||
}
|
||||
|
||||
cells.rm(i);
|
||||
|
||||
if (new_head == cells.size()) {
|
||||
new_head = i;
|
||||
// If we freed up a slot, set head to it so searching can start there.
|
||||
if (new_head != cells.size() && new_head < head) {
|
||||
head = new_head;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we freed up a slot, set head to it so searching can start there.
|
||||
if (new_head != cells.size() && new_head < head) {
|
||||
head = new_head;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -5998,6 +5998,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_sum());
|
||||
test_cases.emplace_back(new test_sum_rows());
|
||||
test_cases.emplace_back(new test_mean());
|
||||
test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1, 1, 1 }));
|
||||
test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1, 1, 1 }));
|
||||
test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 1, 1, 1 }));
|
||||
test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
|
||||
test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
|
||||
test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 256, 1, 1 }));
|
||||
test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 256, 1, 1 }));
|
||||
test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 256, 1, 1 }));
|
||||
test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 32769, 1, 1, 1 }));
|
||||
test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1}));
|
||||
test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
|
||||
test_cases.emplace_back(new test_acc());
|
||||
@@ -6179,6 +6188,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
test_cases.emplace_back(new test_add_id(GGML_TYPE_F32, GGML_TYPE_F32, 2880, 32, 4, n_token));
|
||||
}
|
||||
|
||||
std::vector<std::array<int64_t, 4>> reduce_rows_cases = {
|
||||
{ 8192, 1, 1, 1 },
|
||||
{ 8192, 8192, 1, 1 },
|
||||
{ 128, 8192, 1, 1 },
|
||||
};
|
||||
|
||||
for (auto it: reduce_rows_cases){
|
||||
test_cases.emplace_back(new test_mean(GGML_TYPE_F32, it));
|
||||
test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, it));
|
||||
test_cases.emplace_back(new test_sum(GGML_TYPE_F32, it));
|
||||
}
|
||||
|
||||
return test_cases;
|
||||
}
|
||||
|
||||
|
||||
@@ -44,6 +44,7 @@
|
||||
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
||||
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
||||
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
||||
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
|
||||
|
||||
// audio-specific
|
||||
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
|
||||
|
||||
+33
-44
@@ -201,6 +201,7 @@ struct clip_hparams {
|
||||
// legacy
|
||||
bool has_llava_projector = false;
|
||||
int minicpmv_version = 0;
|
||||
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
|
||||
};
|
||||
|
||||
struct clip_layer {
|
||||
@@ -866,21 +867,8 @@ struct clip_graph {
|
||||
int n_embd = clip_n_mmproj_embd(ctx);
|
||||
const int d_head = 128;
|
||||
int n_head = n_embd/d_head;
|
||||
int num_query = 96;
|
||||
if (ctx->model.hparams.minicpmv_version == 2) {
|
||||
// MiniCPM-V 2.5
|
||||
num_query = 96;
|
||||
} else if (ctx->model.hparams.minicpmv_version == 3) {
|
||||
// MiniCPM-V 2.6
|
||||
num_query = 64;
|
||||
} else if (ctx->model.hparams.minicpmv_version == 4) {
|
||||
// MiniCPM-o 2.6
|
||||
num_query = 64;
|
||||
} else if (ctx->model.hparams.minicpmv_version == 5) {
|
||||
// MiniCPM-V 4.0
|
||||
num_query = 64;
|
||||
}
|
||||
|
||||
// Use actual config value if available, otherwise fall back to hardcoded values
|
||||
int num_query = ctx->model.hparams.minicpmv_query_num;
|
||||
ggml_tensor * Q = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
|
||||
model.mm_model_attn_q_b);
|
||||
@@ -2138,7 +2126,19 @@ struct clip_model_loader {
|
||||
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
|
||||
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
|
||||
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
|
||||
|
||||
get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
|
||||
if (hparams.minicpmv_query_num == 0) {
|
||||
// Fallback to hardcoded values for legacy models
|
||||
if (hparams.minicpmv_version == 3) {
|
||||
hparams.minicpmv_query_num = 64;
|
||||
} else if (hparams.minicpmv_version == 4) {
|
||||
hparams.minicpmv_query_num = 64;
|
||||
} else if (hparams.minicpmv_version == 5) {
|
||||
hparams.minicpmv_query_num = 64;
|
||||
} else {
|
||||
hparams.minicpmv_query_num = 96;
|
||||
}
|
||||
}
|
||||
} else if (is_audio) {
|
||||
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
|
||||
|
||||
@@ -3556,20 +3556,23 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MINICPMV:
|
||||
{
|
||||
if (params.minicpmv_version == 2) {
|
||||
// MiniCPM-V 2.5
|
||||
n_patches_sq = 96;
|
||||
} else if (params.minicpmv_version == 3) {
|
||||
// MiniCPM-V 2.6
|
||||
n_patches_sq = 64;
|
||||
} else if (params.minicpmv_version == 4) {
|
||||
// MiniCPM-o 2.6
|
||||
n_patches_sq = 64;
|
||||
} else if (params.minicpmv_version == 5) {
|
||||
// MiniCPM-V 4.0
|
||||
n_patches_sq = 64;
|
||||
// Use actual config value if available, otherwise fall back to hardcoded values
|
||||
if (params.minicpmv_query_num > 0) {
|
||||
n_patches_sq = params.minicpmv_query_num;
|
||||
} else {
|
||||
GGML_ABORT("Unknown minicpmv version");
|
||||
// Fallback to hardcoded values for legacy models
|
||||
if (params.minicpmv_version == 2) {
|
||||
n_patches_sq = 96;
|
||||
} else if (params.minicpmv_version == 3) {
|
||||
n_patches_sq = 64;
|
||||
} else if (params.minicpmv_version == 4) {
|
||||
n_patches_sq = 64;
|
||||
} else if (params.minicpmv_version == 5) {
|
||||
// MiniCPM-V 4.0
|
||||
n_patches_sq = 64;
|
||||
} else {
|
||||
GGML_ABORT("Unknown minicpmv version");
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
@@ -4102,7 +4105,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
|
||||
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
switch (ctx->model.proj_type) {
|
||||
case PROJECTOR_TYPE_LDP:
|
||||
return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
|
||||
@@ -4114,20 +4116,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
return ctx->model.mm_3_b->ne[0];
|
||||
case PROJECTOR_TYPE_MINICPMV:
|
||||
if (hparams.minicpmv_version == 2) {
|
||||
// MiniCPM-V 2.5
|
||||
return 4096;
|
||||
} else if (hparams.minicpmv_version == 3) {
|
||||
// MiniCPM-V 2.6
|
||||
return 3584;
|
||||
} else if (hparams.minicpmv_version == 4) {
|
||||
// MiniCPM-o 2.6
|
||||
return 3584;
|
||||
} else if (hparams.minicpmv_version == 5) {
|
||||
// MiniCPM-V 4.0
|
||||
return 2560;
|
||||
}
|
||||
GGML_ABORT("Unknown minicpmv version");
|
||||
return ctx->model.mm_model_proj->ne[0];
|
||||
case PROJECTOR_TYPE_GLM_EDGE:
|
||||
return ctx->model.mm_model_mlp_3_w->ne[1];
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
|
||||
@@ -517,6 +517,16 @@ if args.use_f32:
|
||||
# output in the same directory as the model if output_dir is None
|
||||
dir_model = args.model_dir
|
||||
|
||||
# Read config.json to get actual model configuration
|
||||
config_path = os.path.join(dir_model, "config.json")
|
||||
model_config = {}
|
||||
if os.path.isfile(config_path):
|
||||
with open(config_path, "r", encoding="utf-8") as f:
|
||||
model_config = json.load(f)
|
||||
print(f"Loaded config from {config_path}")
|
||||
else:
|
||||
print(f"Warning: config.json not found at {config_path}")
|
||||
|
||||
# If minicpmv_projector is not specified but the default path exists, use the default path
|
||||
if args.minicpmv_projector is None:
|
||||
default_projector_path = os.path.join(dir_model, "minicpmv.projector")
|
||||
@@ -555,37 +565,62 @@ if args.use_f32:
|
||||
# processor = CLIPProcessor.from_pretrained(dir_model)
|
||||
|
||||
minicpmv_version = args.minicpmv_version
|
||||
emb_dim = 4096
|
||||
block_count = 26
|
||||
if minicpmv_version == 1: # MiniCPM-V 2.0
|
||||
emb_dim = 2304
|
||||
block_count = 26
|
||||
elif minicpmv_version == 2: # MiniCPM-V 2.5
|
||||
emb_dim = 4096
|
||||
block_count = 27
|
||||
elif minicpmv_version == 3: # MiniCPM-V 2.6
|
||||
emb_dim = 3584
|
||||
block_count = 27
|
||||
elif minicpmv_version == 4: # MiniCPM-o 2.6
|
||||
emb_dim = 3584
|
||||
block_count = 27
|
||||
elif minicpmv_version == 5: # MiniCPM-V 4.0
|
||||
emb_dim = 2560
|
||||
block_count = 27
|
||||
|
||||
default_vision_config = {
|
||||
"hidden_size": 1152,
|
||||
"image_size": 980,
|
||||
"intermediate_size": 4304,
|
||||
"model_type": "idefics2",
|
||||
"num_attention_heads": 16,
|
||||
"num_hidden_layers": 27,
|
||||
"patch_size": 14,
|
||||
# Use actual config values instead of hardcoded ones
|
||||
if model_config:
|
||||
# For the projector/resampler, use the main model's hidden_size
|
||||
emb_dim = model_config.get("hidden_size", 1536)
|
||||
|
||||
# For the vision model, use vision_config values
|
||||
vision_config_dict = model_config.get("vision_config", {})
|
||||
default_vision_config = {
|
||||
"hidden_size": vision_config_dict.get("hidden_size", 1152),
|
||||
"image_size": vision_config_dict.get("image_size", 980),
|
||||
"intermediate_size": vision_config_dict.get("intermediate_size", 4304),
|
||||
"model_type": vision_config_dict.get("model_type", "siglip"),
|
||||
"num_attention_heads": vision_config_dict.get("num_attention_heads", 16),
|
||||
"num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27),
|
||||
"patch_size": vision_config_dict.get("patch_size", 14),
|
||||
}
|
||||
|
||||
# Use vision model's num_hidden_layers for block_count
|
||||
block_count = vision_config_dict.get("num_hidden_layers", 27)
|
||||
|
||||
print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
|
||||
print(f"Vision config: {default_vision_config}")
|
||||
else:
|
||||
# Fallback to original hardcoded logic if config.json not found
|
||||
emb_dim = 4096
|
||||
block_count = 26
|
||||
if minicpmv_version == 1:
|
||||
emb_dim = 2304
|
||||
block_count = 26
|
||||
elif minicpmv_version == 2:
|
||||
emb_dim = 4096
|
||||
block_count = 27
|
||||
elif minicpmv_version == 3:
|
||||
emb_dim = 3584
|
||||
block_count = 27
|
||||
elif minicpmv_version == 4:
|
||||
emb_dim = 3584
|
||||
block_count = 27
|
||||
elif minicpmv_version == 5:
|
||||
emb_dim = 2560
|
||||
block_count = 27
|
||||
|
||||
default_vision_config = {
|
||||
"hidden_size": 1152,
|
||||
"image_size": 980,
|
||||
"intermediate_size": 4304,
|
||||
"model_type": "idefics2",
|
||||
"num_attention_heads": 16,
|
||||
"num_hidden_layers": 27,
|
||||
"patch_size": 14,
|
||||
}
|
||||
|
||||
vision_config = Idefics2VisionConfig(**default_vision_config)
|
||||
model = Idefics2VisionTransformer(vision_config)
|
||||
if minicpmv_version == 3:
|
||||
if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"):
|
||||
vision_config = SiglipVisionConfig(**default_vision_config)
|
||||
model = SiglipVisionTransformer(vision_config)
|
||||
elif minicpmv_version == 4:
|
||||
@@ -644,16 +679,27 @@ else:
|
||||
fout.add_description("two-tower CLIP model")
|
||||
|
||||
if has_vision_encoder:
|
||||
# vision_model hparams
|
||||
fout.add_uint32("clip.vision.image_size", 448)
|
||||
fout.add_uint32("clip.vision.patch_size", 14)
|
||||
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
|
||||
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
|
||||
# vision_model hparams - use actual config values
|
||||
vision_image_size = model_config.get("image_size", 448) if model_config else 448
|
||||
vision_patch_size = default_vision_config.get("patch_size", 14)
|
||||
vision_hidden_size = default_vision_config.get("hidden_size", 1152)
|
||||
vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
|
||||
vision_attention_heads = default_vision_config.get("num_attention_heads", 16)
|
||||
|
||||
fout.add_uint32("clip.vision.image_size", vision_image_size)
|
||||
fout.add_uint32("clip.vision.patch_size", vision_patch_size)
|
||||
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
|
||||
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
|
||||
fout.add_uint32("clip.vision.projection_dim", 0)
|
||||
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
|
||||
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
|
||||
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
|
||||
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
|
||||
|
||||
# Add MiniCPM-V specific parameters
|
||||
query_num = model_config.get("query_num", 0) if model_config else 0
|
||||
resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0
|
||||
fout.add_uint32("clip.minicpmv_query_num", query_num)
|
||||
|
||||
if processor is not None:
|
||||
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
|
||||
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
|
||||
|
||||
@@ -16,6 +16,8 @@ mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]
|
||||
|
||||
# store these tensors in a new dictionary and torch.save them
|
||||
projector = {name: checkpoint[name].float() for name in mm_tensors}
|
||||
if 'resampler.proj' in projector.keys() and hasattr(model.llm.config,'scale_emb') is True:
|
||||
projector['resampler.proj'] = projector['resampler.proj'] / model.llm.config.scale_emb
|
||||
torch.save(projector, f"{args.model}/minicpmv.projector")
|
||||
|
||||
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
|
||||
|
||||
@@ -1132,6 +1132,12 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
|
||||
|
||||
`chat_template_kwargs`: Allows sending additional parameters to the json templating system. For example: `{"enable_thinking": false}`
|
||||
|
||||
`reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.
|
||||
|
||||
`thinking_forced_open`: Force a reasoning model to always output the reasoning. Only works on certain models.
|
||||
|
||||
`parse_tool_calls`: Whether to parse the generated tool call.
|
||||
|
||||
*Examples:*
|
||||
|
||||
You can use either Python `openai` library with appropriate checkpoints:
|
||||
|
||||
Binary file not shown.
@@ -383,8 +383,12 @@ struct server_task {
|
||||
} else {
|
||||
params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
|
||||
}
|
||||
params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format;
|
||||
params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params_base.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
|
||||
common_reasoning_format reasoning_format = params_base.reasoning_format;
|
||||
if (data.contains("reasoning_format")) {
|
||||
reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
|
||||
}
|
||||
params.oaicompat_chat_syntax.reasoning_format = reasoning_format;
|
||||
params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
|
||||
params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
|
||||
params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
|
||||
}
|
||||
|
||||
@@ -209,6 +209,7 @@ export const AppContextProvider = ({
|
||||
messages,
|
||||
stream: true,
|
||||
cache_prompt: true,
|
||||
reasoning_format: 'none',
|
||||
samplers: config.samplers,
|
||||
temperature: config.temperature,
|
||||
dynatemp_range: config.dynatemp_range,
|
||||
|
||||
Reference in New Issue
Block a user