Compare commits

...

4 Commits

Author SHA1 Message Date
Kyle Mistele 39baaf55a1 docker : add server-first container images (#5157)
* feat: add Dockerfiles for each platform that user ./server instead of ./main

* feat: update .github/workflows/docker.yml to build server-first docker containers

* doc: add information about running the server with Docker to README.md

* doc: add information about running with docker to the server README

* doc: update n-gpu-layers to show correct GPU usage

* fix(doc): update container tag from `server` to `server-cuda` for README example on running server container with CUDA
2024-01-28 09:55:31 +02:00
John 6db2b41a76 llava : support for Yi-VL and fix for mobileVLM (#5093)
* Support for Yi-VL, templating fix for mobileVLM

* ws

* Update examples/llava/clip.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update llava-cli.cpp

* Update clip.cpp

bugfix for new conversions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-01-27 17:09:18 +02:00
Georgi Gerganov 753eafed0e sync : ggml 2024-01-27 17:00:24 +02:00
Judd e976423005 ggml : check ggml_add src1 type (ggml/708)
Co-authored-by: Judd <foldl@boxvest.com>
2024-01-27 16:59:00 +02:00
12 changed files with 247 additions and 13 deletions
+32
View File
@@ -0,0 +1,32 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
RUN make
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
COPY --from=build /app/server /server
ENTRYPOINT [ "/server" ]
+25
View File
@@ -0,0 +1,25 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG UBUNTU_VERSION=22.04
FROM intel/hpckit:$ONEAPI_VERSION as build
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
RUN mkdir build && \
cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
cmake --build . --config Release --target main server
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]
+45
View File
@@ -0,0 +1,45 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT [ "/app/server" ]
+20
View File
@@ -0,0 +1,20 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
RUN make
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]
+4
View File
@@ -28,14 +28,18 @@ jobs:
config:
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
# have disabled them for now until the reason why
# is understood.
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
steps:
- name: Check out the repo
uses: actions/checkout@v3
+13 -1
View File
@@ -931,17 +931,20 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
* Create a folder to store big models & intermediate files (ex. /llama/models)
#### Images
We have two Docker images available for this project:
We have three Docker images available for this project:
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executabhle file. (platforms: `linux/amd64`, `linux/arm64`)
Additionally, there the following images, similar to the above:
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
@@ -967,6 +970,12 @@ or with a light image:
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
```
or with a server image:
```bash
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
```
### Docker With CUDA
Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@@ -976,6 +985,7 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
@@ -989,6 +999,7 @@ The resulting images, are essentially the same as the non-CUDA images:
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
#### Usage
@@ -997,6 +1008,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
```
### Contributing
+63 -7
View File
@@ -98,6 +98,7 @@ static std::string format(const char * fmt, ...) {
enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -304,10 +305,18 @@ struct clip_vision_model {
struct ggml_tensor * projection;
// LLaVA projection
struct ggml_tensor * mm_0_w;
struct ggml_tensor * mm_0_b;
struct ggml_tensor * mm_2_w;
struct ggml_tensor * mm_2_b;
struct ggml_tensor * mm_0_w = NULL;
struct ggml_tensor * mm_0_b = NULL;
struct ggml_tensor * mm_2_w = NULL;
struct ggml_tensor * mm_2_b = NULL;
// Yi type models with mlp+normalization projection
struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
struct ggml_tensor * mm_1_b = NULL;
struct ggml_tensor * mm_3_w = NULL;
struct ggml_tensor * mm_3_b = NULL;
struct ggml_tensor * mm_4_w = NULL;
struct ggml_tensor * mm_4_b = NULL;
// MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w;
@@ -460,6 +469,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// pre-layernorm
{
embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "pre_ln");
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
}
@@ -575,6 +585,27 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
// First LayerNorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
model.mm_1_b);
// GELU activation
embeddings = ggml_gelu(ctx0, embeddings);
// Second linear layer
embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
// Second LayerNorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
model.mm_4_b);
}
else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projector
@@ -808,6 +839,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
else {
new_clip->proj_type = PROJECTOR_TYPE_MLP;
}
if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
}
}
}
#ifdef GGML_USE_CUBLAS
@@ -956,11 +992,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
// LLaVA projection
if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
try {
// Yi-type llava
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
} catch (std::runtime_error & e) { }
try {
// missing in Yi-type llava
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
} catch (std::runtime_error & e) { }
try {
// Yi-type llava
vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
} catch (std::runtime_error & e) { }
try {
// Yi-type llava
vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
} catch (std::runtime_error & e) { }
}
else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection
@@ -1432,6 +1486,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
}
else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
return ctx->vision_model.mm_2_b->ne[0];
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
return ctx->vision_model.mm_3_b->ne[0];
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
+29 -3
View File
@@ -148,10 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
// llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
std::string system_prompt, user_prompt;
size_t image_pos = prompt.find("<image>");
if (image_pos != std::string::npos) {
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
// We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
size_t pos = 0;
while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
user_prompt.replace(pos, 2, "\n");
pos += 1; // Advance past the replaced newline
}
while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
system_prompt.replace(pos, 2, "\n");
pos += 1; // Advance past the replaced newline
}
printf("system_prompt: %s\n", system_prompt.c_str());
printf("user_prompt: %s\n", user_prompt.c_str());
} else {
// llava-1.5 native mode
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
user_prompt = prompt + "\nASSISTANT:";
}
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
// generate the response
@@ -162,6 +187,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);
fflush(stdout);
+8
View File
@@ -66,6 +66,14 @@ server.exe -m models\7B\ggml-model.gguf -c 2048
The above command will start a server that by default listens on `127.0.0.1:8080`.
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
### Docker:
```bash
docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
# or, with CUDA:
docker run -p 8080:8080 -v /path/to/models:/models --gpus all ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
```
## Testing with CURL
Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.
+1
View File
@@ -249,6 +249,7 @@ struct llama_server_queue {
}
// Start the main loop. This call is blocking
[[noreturn]]
void start_loop() {
while (true) {
// new task arrived
+6 -1
View File
@@ -7498,7 +7498,12 @@ static void ggml_compute_forward_add(
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_add_f32(params, src0, src1, dst);
if (src1->type == GGML_TYPE_F32) {
ggml_compute_forward_add_f32(params, src0, src1, dst);
}
else {
GGML_ASSERT(false);
}
} break;
case GGML_TYPE_F16:
{
+1 -1
View File
@@ -1 +1 @@
6c1ce0bd591a430c1d3f6797d905194581c878c1
c2448f88d17395452a587d0176d19ed87e0f7ce1