Compare commits

...

8 Commits

Author SHA1 Message Date
Xuan-Son Nguyen 13be08daf9 clip : remove boi/eoi embeddings for GLM-edge model (#13081) 2025-04-24 22:17:04 +02:00
Georgi Gerganov 226251ed56 embeddings : fix batch sizes (#13076)
ggml-ci
2025-04-24 22:29:22 +03:00
Georgi Gerganov 87616f0680 ggml : fix trailing whitespaces (#0) 2025-04-24 17:32:47 +03:00
Georgi Gerganov 63b4911494 sync : ggml
ggml-ci
2025-04-24 17:32:47 +03:00
Acly c6e8cc28c1 ggml : Depthwise 2D convolution (ggml/1152)
* ggml-cpu : kernels for faster depthwise 2D convolution

* fix compile: remove static after moving to ops.cpp

* add dilation for depthwise_conv_2d

* review: rename to ggml_conv_2d_dw_direct, remove redundant struct keywords, pass by ref, whitespace

* review: rename depthwise_conv_2d -> conv_2d_dw everywhere
2025-04-24 17:32:47 +03:00
Johannes Gäßler b10d8bfdb1 CUDA: use switch statements in constexpr functions (#13095) 2025-04-24 15:57:10 +02:00
Georgi Gerganov 13b4548877 cmake : do not include ./src as public for libllama (#13062)
* cmake : do not include ./src as public for libllama

ggml-ci

* cmake : rework tests

ggml-ci

* llguidance : remove unicode include

ggml-ci

* cmake : make c++17 private

ggml-ci
2025-04-24 16:00:10 +03:00
Georgi Gerganov 572b3141d3 clang-tidy : disable warning about missing math parenthesis (#13091) 2025-04-24 15:44:05 +03:00
30 changed files with 413 additions and 168 deletions
+1
View File
@@ -13,6 +13,7 @@ Checks: >
-readability-magic-numbers,
-readability-uppercase-literal-suffix,
-readability-simplify-boolean-expr,
-readability-math-missing-parentheses,
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
-2
View File
@@ -994,7 +994,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-embedding",
"llama-eval-callback",
"llama-export-lora",
"llama-gbnf-validator",
"llama-gen-docs",
"llama-gguf",
"llama-gguf-hash",
@@ -1014,7 +1013,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-perplexity",
"llama-q8dot",
"llama-quantize",
"llama-quantize-stats",
"llama-qwen2vl-cli",
"llama-retrieval",
"llama-run",
-9
View File
@@ -21,11 +21,6 @@ else()
add_subdirectory(embedding)
add_subdirectory(eval-callback)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(gbnf-validator)
endif()
add_subdirectory(gguf-hash)
add_subdirectory(gguf-split)
add_subdirectory(gguf)
@@ -58,10 +53,6 @@ else()
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(cvector-generator)
add_subdirectory(export-lora)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(quantize-stats)
endif()
add_subdirectory(llava)
if (GGML_RPC)
add_subdirectory(rpc)
+7 -1
View File
@@ -89,6 +89,13 @@ int main(int argc, char ** argv) {
common_init();
params.embedding = true;
// utilize the full context
if (params.n_batch < params.n_ctx) {
LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
params.n_batch = params.n_ctx;
}
// For non-causal models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
@@ -134,7 +141,6 @@ int main(int argc, char ** argv) {
// max batch size
const uint64_t n_batch = params.n_batch;
GGML_ASSERT(params.n_batch >= params.n_ctx);
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;
-5
View File
@@ -1,5 +0,0 @@
set(TARGET llama-gbnf-validator)
add_executable(${TARGET} gbnf-validator.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
-2
View File
@@ -90,8 +90,6 @@
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
#define TN_GLM_BOI_W "adapter.boi"
#define TN_GLM_EOI_W "adapter.eoi"
enum projector_type {
PROJECTOR_TYPE_MLP,
+1 -16
View File
@@ -244,8 +244,6 @@ struct clip_vision_model {
//GLMV-Edge projection
struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
struct ggml_tensor * boi_w = nullptr;
struct ggml_tensor * eoi_w = nullptr;
// MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w = nullptr;
@@ -1697,8 +1695,6 @@ struct clip_model_loader {
vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
vision_model.boi_w = get_tensor(TN_GLM_BOI_W);
vision_model.eoi_w = get_tensor(TN_GLM_EOI_W);
} break;
case PROJECTOR_TYPE_MERGER:
{
@@ -2593,8 +2589,7 @@ void clip_free(clip_ctx * ctx) {
}
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
int extra_tokens = ctx->has_glm_projector ? 2 : 0;
return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
@@ -2790,9 +2785,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
if (ctx->has_glm_projector) {
GGML_ASSERT(batch_size == 1);
ggml_tensor * boi = ctx->vision_model.boi_w;
ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
}
// build the inference graph
@@ -3001,13 +2993,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
if (ctx->has_glm_projector) {
//eoi
ggml_tensor * eoi = ctx->vision_model.eoi_w;
int offset = ggml_nelements(embeddings);
ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
}
return true;
}
+5
View File
@@ -186,6 +186,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
// <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
-6
View File
@@ -1,6 +0,0 @@
set(TARGET llama-quantize-stats)
add_executable(${TARGET} quantize-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
+21 -1
View File
@@ -481,6 +481,7 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
@@ -677,6 +678,9 @@ extern "C" {
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@@ -1660,7 +1664,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// depthwise
// depthwise (via im2col and mul_mat)
GGML_API struct ggml_tensor * ggml_conv_2d_dw(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
@@ -1672,6 +1676,22 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1
// Depthwise 2D convolution
// may be faster than ggml_conv_2d_dw, but not available in all backends
// a: KW KH 1 C convolution kernel
// b: W H C N input data
// res: W_out H_out C N
GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride0,
int stride1,
int pad0,
int pad1,
int dilation0,
int dilation1);
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
struct ggml_context * ctx,
struct ggml_tensor * a,
+5
View File
@@ -1932,6 +1932,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_im2col_back_f32(params, tensor);
} break;
case GGML_OP_CONV_2D_DW:
{
ggml_compute_forward_conv_2d_dw(params, tensor);
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -2268,6 +2272,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_IM2COL:
case GGML_OP_IM2COL_BACK:
case GGML_OP_CONV_2D_DW:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_CONV_TRANSPOSE_2D:
{
+172
View File
@@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d(
}
}
// ggml_compute_forward_conv_2d_dw
struct ggml_conv_2d_dw_params {
int64_t channels;
int64_t batch;
int64_t src_w;
int64_t src_h;
int64_t dst_w;
int64_t dst_h;
int64_t knl_w;
int64_t knl_h;
int stride_x;
int stride_y;
int pad_x;
int pad_y;
int dilation_x;
int dilation_y;
};
static void ggml_compute_forward_conv_2d_dw_cwhn(
const ggml_compute_params * params,
const ggml_tensor * src,
const ggml_tensor * kernel,
ggml_tensor * dst,
const ggml_conv_2d_dw_params & p) {
const int64_t c = p.channels;
const float * knl_data = (const float *)kernel->data;
const int64_t rows_total = p.dst_h * p.batch;
const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
const int64_t row_start = params->ith * rows_per_thread;
const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
#ifdef GGML_SIMD
const int64_t pkg_size = GGML_F32_EPR;
const int64_t pkg_count = c / pkg_size;
const int64_t c_pkg_end = pkg_count * pkg_size;
#else
const int64_t c_pkg_end = 0;
#endif
for (int64_t row = row_start; row < row_end; ++row) {
const int64_t dst_y = row % p.dst_h;
const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
#ifdef GGML_SIMD
// Vectorized loop
for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
sum = GGML_F32_VEC_FMA(sum, k, s);
}
}
GGML_F32_VEC_STORE(dst_data + c_i, sum);
}
#endif
// Scalar loop
for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
float sum = 0.0f;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
* src_data[(src_y * p.src_w + src_x) * c + c_i];
}
}
dst_data[c_i] = sum;
}
}
}
}
static void ggml_compute_forward_conv_2d_dw_whcn(
const ggml_compute_params * params,
const ggml_tensor * src,
const ggml_tensor * kernel,
ggml_tensor * dst,
const ggml_conv_2d_dw_params & p) {
const int64_t n = p.channels * p.batch;
const int64_t per_thread = (n + params->nth - 1) / params->nth;
const int64_t start = params->ith * per_thread;
const int64_t end = MIN(start + per_thread, n);
for (int64_t i = start; i < end; ++i) {
const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
float sum = 0.0f;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
sum += knl_data[knl_y * p.knl_w + knl_x]
* src_data[src_y * p.src_w + src_x];
}
}
dst_data[dst_y * p.dst_w + dst_x] = sum;
}
}
}
}
void ggml_compute_forward_conv_2d_dw(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * kernel = dst->src[0];
const ggml_tensor * src = dst->src[1];
ggml_conv_2d_dw_params p;
p.channels = src->ne[2];
p.batch = src->ne[3];
p.src_w = src->ne[0];
p.src_h = src->ne[1];
p.dst_w = dst->ne[0];
p.dst_h = dst->ne[1];
p.knl_w = kernel->ne[0];
p.knl_h = kernel->ne[1];
p.stride_x = dst->op_params[0];
p.stride_y = dst->op_params[1];
p.pad_x = dst->op_params[2];
p.pad_y = dst->op_params[3];
p.dilation_x = dst->op_params[4];
p.dilation_y = dst->op_params[5];
GGML_ASSERT(kernel->ne[3] == p.channels);
GGML_ASSERT(dst->ne[3] == p.batch);
if (ggml_is_contiguous(src)) {
ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
} else if (ggml_is_contiguous_channels(src)) {
// kernel should also have channels most contiguous in memory
GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
} else {
GGML_ABORT("non-contiguous memory layout not supported");
}
}
// ggml_compute_forward_pool_1d_sk_p0
static void ggml_compute_forward_pool_1d_sk_p0(
+1
View File
@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+42 -38
View File
@@ -155,25 +155,27 @@ static constexpr __device__ int get_mmq_y_device() {
#define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + mmq_y/QI6_K, mmq_y*WARP_SIZE/8 + mmq_y/8}
static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) {
return type == GGML_TYPE_Q4_0 ? MMQ_DP4A_TXS_Q4_0 :
type == GGML_TYPE_Q4_1 ? MMQ_DP4A_TXS_Q4_1 :
type == GGML_TYPE_Q5_0 ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_Q5_1 ? MMQ_DP4A_TXS_Q8_1 :
type == GGML_TYPE_Q8_0 ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_Q2_K ? MMQ_DP4A_TXS_Q2_K :
type == GGML_TYPE_Q3_K ? MMQ_DP4A_TXS_Q3_K :
type == GGML_TYPE_Q4_K ? MMQ_DP4A_TXS_Q4_K :
type == GGML_TYPE_Q5_K ? MMQ_DP4A_TXS_Q5_K :
type == GGML_TYPE_Q6_K ? MMQ_DP4A_TXS_Q6_K :
type == GGML_TYPE_IQ2_XXS ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ2_XS ? MMQ_DP4A_TXS_Q8_0_16 :
type == GGML_TYPE_IQ2_S ? MMQ_DP4A_TXS_Q8_0_16 :
type == GGML_TYPE_IQ3_XXS ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ3_S ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ1_S ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ4_XS ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ4_NL ? MMQ_DP4A_TXS_Q8_0 :
tile_x_sizes{0, 0, 0};
switch (type) {
case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0;
case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1;
case GGML_TYPE_Q5_0: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_Q5_1: return MMQ_DP4A_TXS_Q8_1;
case GGML_TYPE_Q8_0: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_Q2_K: return MMQ_DP4A_TXS_Q2_K;
case GGML_TYPE_Q3_K: return MMQ_DP4A_TXS_Q3_K;
case GGML_TYPE_Q4_K: return MMQ_DP4A_TXS_Q4_K;
case GGML_TYPE_Q5_K: return MMQ_DP4A_TXS_Q5_K;
case GGML_TYPE_Q6_K: return MMQ_DP4A_TXS_Q6_K;
case GGML_TYPE_IQ2_XXS: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ2_XS: return MMQ_DP4A_TXS_Q8_0_16;
case GGML_TYPE_IQ2_S: return MMQ_DP4A_TXS_Q8_0_16;
case GGML_TYPE_IQ3_XXS: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ3_S: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ1_S: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0;
default: return tile_x_sizes{0, 0, 0};
}
}
#define MMQ_MMA_TILE_X_K_Q8_0 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0 + 4)
@@ -189,25 +191,27 @@ static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding.");
static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding.");
static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_Q4_1 ? MMQ_MMA_TILE_X_K_Q8_1 :
type == GGML_TYPE_Q5_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_Q5_1 ? MMQ_MMA_TILE_X_K_Q8_1 :
type == GGML_TYPE_Q8_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_Q2_K ? MMQ_MMA_TILE_X_K_Q2_K :
type == GGML_TYPE_Q3_K ? MMQ_MMA_TILE_X_K_Q3_K :
type == GGML_TYPE_Q4_K ? MMQ_MMA_TILE_X_K_Q8_1 :
type == GGML_TYPE_Q5_K ? MMQ_MMA_TILE_X_K_Q8_1 :
type == GGML_TYPE_Q6_K ? MMQ_MMA_TILE_X_K_Q6_K :
type == GGML_TYPE_IQ2_XXS ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ2_XS ? MMQ_MMA_TILE_X_K_Q3_K :
type == GGML_TYPE_IQ2_S ? MMQ_MMA_TILE_X_K_Q3_K :
type == GGML_TYPE_IQ3_XXS ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ3_S ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ1_S ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ4_XS ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ4_NL ? MMQ_MMA_TILE_X_K_Q8_0 :
0;
switch (type) {
case GGML_TYPE_Q4_0: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q4_1: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q5_1: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K;
case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K;
case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q5_K: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q6_K: return MMQ_MMA_TILE_X_K_Q6_K;
case GGML_TYPE_IQ2_XXS: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ2_XS: return MMQ_MMA_TILE_X_K_Q3_K;
case GGML_TYPE_IQ2_S: return MMQ_MMA_TILE_X_K_Q3_K;
case GGML_TYPE_IQ3_XXS: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ3_S: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ1_S: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0;
default: return 0;
}
}
#define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1)
+42 -38
View File
@@ -7,47 +7,51 @@
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 :
type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 :
type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 :
type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 :
type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 :
type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 :
type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 :
type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 :
type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 :
type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 :
type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
nullptr;
switch (type) {
case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1;
case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1;
case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1;
case GGML_TYPE_Q5_1: return vec_dot_q5_1_q8_1;
case GGML_TYPE_Q8_0: return vec_dot_q8_0_q8_1;
case GGML_TYPE_Q2_K: return vec_dot_q2_K_q8_1;
case GGML_TYPE_Q3_K: return vec_dot_q3_K_q8_1;
case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1;
case GGML_TYPE_Q5_K: return vec_dot_q5_K_q8_1;
case GGML_TYPE_Q6_K: return vec_dot_q6_K_q8_1;
case GGML_TYPE_IQ2_XXS: return vec_dot_iq2_xxs_q8_1;
case GGML_TYPE_IQ2_XS: return vec_dot_iq2_xs_q8_1;
case GGML_TYPE_IQ2_S: return vec_dot_iq2_s_q8_1;
case GGML_TYPE_IQ3_XXS: return vec_dot_iq3_xxs_q8_1;
case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1;
case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1;
case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1;
case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1;
case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1;
default: return nullptr;
}
}
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ :
type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ :
type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ :
type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ :
type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ :
1;
switch (type) {
case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ;
case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ;
case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ;
case GGML_TYPE_Q5_1: return VDR_Q5_1_Q8_1_MMVQ;
case GGML_TYPE_Q8_0: return VDR_Q8_0_Q8_1_MMVQ;
case GGML_TYPE_Q2_K: return VDR_Q2_K_Q8_1_MMVQ;
case GGML_TYPE_Q3_K: return VDR_Q3_K_Q8_1_MMVQ;
case GGML_TYPE_Q4_K: return VDR_Q4_K_Q8_1_MMVQ;
case GGML_TYPE_Q5_K: return VDR_Q5_K_Q8_1_MMVQ;
case GGML_TYPE_Q6_K: return VDR_Q6_K_Q8_1_MMVQ;
case GGML_TYPE_IQ2_XXS: return VDR_IQ2_XXS_Q8_1_MMVQ;
case GGML_TYPE_IQ2_XS: return VDR_IQ2_XS_Q8_1_MMVQ;
case GGML_TYPE_IQ2_S: return VDR_IQ2_S_Q8_1_MMVQ;
case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ;
case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ;
case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ;
case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ;
default: return 1;
}
}
enum mmvq_parameter_table_id {
+51 -2
View File
@@ -956,6 +956,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CONV_TRANSPOSE_1D",
"IM2COL",
"IM2COL_BACK",
"CONV_2D_DW",
"CONV_TRANSPOSE_2D",
"POOL_1D",
"POOL_2D",
@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"conv_transpose_1d(x)",
"im2col(x)",
"im2col_back(x)",
"conv_2d_dw(x)",
"conv_transpose_2d(x)",
"pool_1d(x)",
"pool_2d(x)",
@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -1344,6 +1346,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
}
bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
return
tensor->nb[0] > tensor->nb[2] &&
tensor->nb[1] > tensor->nb[0] &&
tensor->nb[2] == ggml_type_size(tensor->type);
}
static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@@ -4050,6 +4059,46 @@ struct ggml_tensor * ggml_conv_2d_dw(
return result;
}
// ggml_conv_2d_dw_direct
struct ggml_tensor * ggml_conv_2d_dw_direct(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride0,
int stride1,
int pad0,
int pad1,
int dilation0,
int dilation1) {
GGML_ASSERT(a->ne[2] == 1);
GGML_ASSERT(a->ne[3] == b->ne[2]);
int64_t ne[4];
ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0);
ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1);
ne[2] = b->ne[2];
ne[3] = b->ne[3];
struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
if (ggml_is_contiguous_channels(b)) {
// Result will be permuted the same way as input (CWHN order)
const int64_t type_size = ggml_type_size(result->type);
GGML_ASSERT(ggml_blck_size(result->type) == 1);
result->nb[0] = result->ne[2] * type_size;
result->nb[1] = result->ne[0] * result->nb[0];
result->nb[2] = type_size;
}
int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_CONV_2D_DW;
result->src[0] = a;
result->src[1] = b;
return result;
}
// ggml_conv_transpose_2d_p0
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
+1 -1
View File
@@ -112,7 +112,7 @@ You can use GBNF grammars:
- In [llama-server](../examples/server)'s completion endpoints, passed as the `grammar` body field
- In [llama-cli](../examples/main), passed as the `--grammar` & `--grammar-file` flags
- With [llama-gbnf-validator](../examples/gbnf-validator) tool, to test them against strings.
- With [test-gbnf-validator](../tests/test-gbnf-validator.cpp), to test them against strings.
## JSON Schemas → GBNF
+1 -1
View File
@@ -1 +1 @@
f71d538ece3fb32a04824dc6d1e73e360be9d22f
13bcf9ce50651a8b4238ec6d136f46f2c1b23b6f
+3 -2
View File
@@ -32,8 +32,9 @@ add_library(llama
unicode.h
)
target_include_directories(llama PUBLIC . ../include)
target_compile_features (llama PUBLIC cxx_std_17) # don't bump
target_include_directories(llama PRIVATE .)
target_include_directories(llama PUBLIC ../include)
target_compile_features (llama PRIVATE cxx_std_17) # don't bump
target_link_libraries(llama PUBLIC ggml)
+39 -30
View File
@@ -1,5 +1,17 @@
llama_add_compile_flags()
function(llama_build source)
if (DEFINED LLAMA_TEST_NAME)
set(TEST_TARGET ${LLAMA_TEST_NAME})
else()
get_filename_component(TEST_TARGET ${source} NAME_WE)
endif()
add_executable(${TEST_TARGET} ${source})
target_link_libraries(${TEST_TARGET} PRIVATE common)
install(TARGETS ${TEST_TARGET} RUNTIME)
endfunction()
function(llama_test target)
include(CMakeParseArguments)
set(options)
@@ -36,7 +48,7 @@ endfunction()
# - LABEL: label for the test (defaults to main)
# - ARGS: arguments to pass to the test executable
# - WORKING_DIRECTORY
function(llama_target_and_test source)
function(llama_build_and_test source)
include(CMakeParseArguments)
set(options)
set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
@@ -58,6 +70,7 @@ function(llama_target_and_test source)
add_executable(${TEST_TARGET} ${source} get-model.cpp)
install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE common)
add_test(
NAME ${TEST_TARGET}
WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
@@ -68,9 +81,7 @@ function(llama_target_and_test source)
endfunction()
# build test-tokenizer-0 target once and add many tests
add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
target_link_libraries(test-tokenizer-0 PRIVATE common)
install(TARGETS test-tokenizer-0 RUNTIME)
llama_build(test-tokenizer-0.cpp)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
@@ -87,27 +98,27 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
if (LLAMA_LLGUIDANCE)
llama_target_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
endif ()
if (NOT WIN32)
# these tests are disabled on Windows because they use internal functions not exported with LLAMA_API
llama_target_and_test(test-sampling.cpp)
llama_target_and_test(test-grammar-parser.cpp)
llama_target_and_test(test-grammar-integration.cpp)
llama_target_and_test(test-llama-grammar.cpp)
llama_target_and_test(test-chat.cpp)
llama_build_and_test(test-sampling.cpp)
llama_build_and_test(test-grammar-parser.cpp)
llama_build_and_test(test-grammar-integration.cpp)
llama_build_and_test(test-llama-grammar.cpp)
llama_build_and_test(test-chat.cpp)
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
endif()
llama_build(test-quantize-stats.cpp)
llama_build(test-gbnf-validator.cpp)
# build test-tokenizer-1-bpe target once and add many tests
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
install(TARGETS test-tokenizer-1-bpe RUNTIME)
llama_build(test-tokenizer-1-bpe.cpp)
# TODO: disabled due to slowness
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
@@ -120,37 +131,35 @@ if (NOT WIN32)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
# build test-tokenizer-1-spm target once and add many tests
add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
target_link_libraries(test-tokenizer-1-spm PRIVATE common)
install(TARGETS test-tokenizer-1-spm RUNTIME)
llama_build(test-tokenizer-1-spm.cpp)
llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
# llama_target_and_test(test-double-float.cpp) # SLOW
# llama_build_and_test(test-double-float.cpp) # SLOW
endif()
llama_target_and_test(test-log.cpp)
llama_target_and_test(test-chat-template.cpp)
llama_build_and_test(test-log.cpp)
llama_build_and_test(test-chat-template.cpp)
# this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
if (NOT WIN32)
llama_target_and_test(test-arg-parser.cpp)
llama_build_and_test(test-arg-parser.cpp)
endif()
# llama_target_and_test(test-opt.cpp) # SLOW
llama_target_and_test(test-gguf.cpp)
llama_target_and_test(test-backend-ops.cpp)
# llama_build_and_test(test-opt.cpp) # SLOW
llama_build_and_test(test-gguf.cpp)
llama_build_and_test(test-backend-ops.cpp)
llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
llama_target_and_test(test-autorelease.cpp LABEL "model")
llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
llama_build_and_test(test-autorelease.cpp LABEL "model")
if (NOT GGML_BACKEND_DL)
# these tests use the backends directly and cannot be built with dynamic loading
llama_target_and_test(test-barrier.cpp)
llama_target_and_test(test-quantize-fns.cpp)
llama_target_and_test(test-quantize-perf.cpp)
llama_target_and_test(test-rope.cpp)
llama_build_and_test(test-barrier.cpp)
llama_build_and_test(test-quantize-fns.cpp)
llama_build_and_test(test-quantize-perf.cpp)
llama_build_and_test(test-rope.cpp)
endif()
+3 -2
View File
@@ -11,8 +11,9 @@
#include <string>
#include "chat.h"
#include "llama-grammar.h"
#include "unicode.h"
#include "../src/unicode.h"
#include "../src/llama-grammar.h"
using json = nlohmann::ordered_json;
@@ -1,5 +1,5 @@
#include "unicode.h"
#include "llama-grammar.h"
#include "../src/unicode.h"
#include "../src/llama-grammar.h"
#include <cstdio>
#include <cstdlib>
+3 -2
View File
@@ -2,10 +2,11 @@
#undef NDEBUG
#endif
#include "unicode.h"
#include "llama-grammar.h"
#include "json-schema-to-grammar.h"
#include "../src/unicode.h"
#include "../src/llama-grammar.h"
#include <cassert>
#include <string>
#include <vector>
+1 -2
View File
@@ -2,7 +2,6 @@
# undef NDEBUG
#endif
#include "unicode.h"
#include "sampling.h"
#include <cassert>
@@ -84,7 +83,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
fprintf(stderr,
"\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following "
"command: ./llama-gbnf-validator test-grammar-integration.grammar.gbnf "
"command: ./test-gbnf-validator test-grammar-integration.grammar.gbnf "
"test-grammar-integration.string.txt\n\n");
} else {
fprintf(stdout, "✅︎\n");
+3 -1
View File
@@ -3,7 +3,9 @@
#endif
#include "llama.h"
#include "llama-grammar.h"
// TODO: shold not include libllama sources
#include "../src/llama-grammar.h"
#include <cassert>
+1 -1
View File
@@ -4,7 +4,7 @@
#include "json-schema-to-grammar.h"
#include "llama-grammar.h"
#include "../src/llama-grammar.h"
#include <cassert>
#include <fstream>
+2 -1
View File
@@ -3,7 +3,8 @@
#endif
#include "llama.h"
#include "llama-grammar.h"
#include "../src/llama-grammar.h"
#include <cassert>
#include <stdexcept>
@@ -1,8 +1,9 @@
#include "ggml.h"
#include "llama.h"
#include "llama-model.h"
#include "common.h"
#include "../src/llama-model.h"
#include <algorithm>
#include <cassert>
#include <cinttypes>
+2 -1
View File
@@ -1,8 +1,9 @@
#include "llama.h"
#include "common.h"
#include "unicode.h"
#include "console.h"
#include "../src/unicode.h"
#include <cassert>
#include <codecvt>
#include <cstdio>
+2 -1
View File
@@ -1,8 +1,9 @@
#include "llama.h"
#include "common.h"
#include "unicode.h"
#include "console.h"
#include "../src/unicode.h"
#include <cassert>
#include <codecvt>
#include <cstdio>