mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-29 09:07:41 +02:00
Compare commits
29 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6f67cf1f48 | |||
| 16a457facd | |||
| 3e168bede4 | |||
| ceda28ef8e | |||
| 3b127c7385 | |||
| e5007a5edf | |||
| 416313773b | |||
| 07c2e2f76c | |||
| 44cd8d91ff | |||
| 5933e6fdc9 | |||
| da84c04d8f | |||
| a0f7016d17 | |||
| 19e899ce21 | |||
| e2e1ddb93a | |||
| d9d398f84f | |||
| 5a63980117 | |||
| cdf76586b2 | |||
| 7d3af70b08 | |||
| 00e3e5a194 | |||
| e98b3692be | |||
| b6ce7430b7 | |||
| 5f5e39e1ba | |||
| eaea325324 | |||
| 43ddab6eee | |||
| 1831f538f7 | |||
| 4e87962e34 | |||
| fb0471d175 | |||
| d2b2031e5f | |||
| 5fa9e63be8 |
@@ -14,9 +14,9 @@ WORKDIR /app
|
||||
COPY . .
|
||||
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then \
|
||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
|
||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
|
||||
elif [ "$TARGETARCH" = "arm64" ]; then \
|
||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
|
||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
|
||||
else \
|
||||
echo "Unsupported architecture"; \
|
||||
exit 1; \
|
||||
|
||||
@@ -21,7 +21,7 @@ COPY . .
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
|
||||
@@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
echo "Building with dynamic libs" && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${OPT_SYCL_F16} && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
|
||||
@@ -22,7 +22,7 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
|
||||
|
||||
RUN echo "Building with static libs" && \
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
|
||||
cmake --build build --config Release --target llama-cli
|
||||
|
||||
# TODO: use image with NNRT
|
||||
|
||||
@@ -35,7 +35,7 @@ COPY . .
|
||||
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
|
||||
@@ -40,7 +40,7 @@ WORKDIR /app
|
||||
COPY . .
|
||||
|
||||
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
||||
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
|
||||
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
|
||||
&& cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib \
|
||||
|
||||
@@ -16,7 +16,7 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
|
||||
@@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
## Hot topics
|
||||
|
||||
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
|
||||
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated
|
||||
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141]((https://github.com/ggml-org/llama.cpp/pull/13141))), `libllava` will be deprecated
|
||||
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
|
||||
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
||||
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
||||
|
||||
+102
-56
@@ -43,6 +43,25 @@ std::initializer_list<enum llama_example> mmproj_examples = {
|
||||
// TODO: add LLAMA_EXAMPLE_SERVER when it's ready
|
||||
};
|
||||
|
||||
static std::string read_file(const std::string & fname) {
|
||||
std::ifstream file(fname);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
|
||||
}
|
||||
std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
||||
file.close();
|
||||
return content;
|
||||
}
|
||||
|
||||
static void write_file(const std::string & fname, const std::string & content) {
|
||||
std::ofstream file(fname);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
|
||||
}
|
||||
file << content;
|
||||
file.close();
|
||||
}
|
||||
|
||||
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
|
||||
this->examples = std::move(examples);
|
||||
return *this;
|
||||
@@ -200,9 +219,11 @@ struct curl_slist_ptr {
|
||||
|
||||
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
|
||||
int remaining_attempts = max_attempts;
|
||||
char * method = nullptr;
|
||||
curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_METHOD, &method);
|
||||
|
||||
while (remaining_attempts > 0) {
|
||||
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
||||
LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
if (res == CURLE_OK) {
|
||||
@@ -213,6 +234,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
|
||||
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
||||
|
||||
remaining_attempts--;
|
||||
if (remaining_attempts == 0) break;
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
||||
}
|
||||
|
||||
@@ -231,8 +253,6 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
return false;
|
||||
}
|
||||
|
||||
bool force_download = false;
|
||||
|
||||
// Set the URL, allow to follow http redirection
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
||||
@@ -256,7 +276,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
|
||||
// If the file exists, check its JSON metadata companion file.
|
||||
std::string metadata_path = path + ".json";
|
||||
nlohmann::json metadata;
|
||||
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
|
||||
@@ -266,14 +286,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
if (metadata_in.good()) {
|
||||
try {
|
||||
metadata_in >> metadata;
|
||||
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
||||
if (metadata.contains("url") && metadata.at("url").is_string()) {
|
||||
auto previous_url = metadata.at("url").get<std::string>();
|
||||
if (previous_url != url) {
|
||||
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
||||
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
|
||||
etag = metadata.at("etag");
|
||||
}
|
||||
@@ -281,10 +294,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
last_modified = metadata.at("lastModified");
|
||||
}
|
||||
} catch (const nlohmann::json::exception & e) {
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
return false;
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
}
|
||||
}
|
||||
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
|
||||
} else {
|
||||
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||
}
|
||||
@@ -296,7 +309,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
};
|
||||
|
||||
common_load_model_from_url_headers headers;
|
||||
bool head_request_ok = false;
|
||||
bool should_download = !file_exists; // by default, we should download if the file does not exist
|
||||
|
||||
// get ETag to see if the remote file has changed
|
||||
{
|
||||
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
||||
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
||||
@@ -325,23 +341,28 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
// we only allow retrying once for HEAD requests
|
||||
// this is for the use case of using running offline (no internet), retrying can be annoying
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
head_request_ok = false;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code != 200) {
|
||||
// HEAD not supported, we don't know if the file has changed
|
||||
// force trigger downloading
|
||||
force_download = true;
|
||||
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||
if (http_code == 200) {
|
||||
head_request_ok = true;
|
||||
} else {
|
||||
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||
head_request_ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
bool should_download = !file_exists || force_download;
|
||||
if (!should_download) {
|
||||
// if head_request_ok is false, we don't have the etag or last-modified headers
|
||||
// we leave should_download as-is, which is true if the file does not exist
|
||||
if (head_request_ok) {
|
||||
// check if ETag or Last-Modified headers are different
|
||||
// if it is, we need to download the file again
|
||||
if (!etag.empty() && etag != headers.etag) {
|
||||
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
||||
should_download = true;
|
||||
@@ -350,6 +371,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
should_download = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (should_download) {
|
||||
std::string path_temporary = path + ".downloadInProgress";
|
||||
if (file_exists) {
|
||||
@@ -424,13 +446,15 @@ static bool common_download_file_single(const std::string & url, const std::stri
|
||||
{"etag", headers.etag},
|
||||
{"lastModified", headers.last_modified}
|
||||
});
|
||||
std::ofstream(metadata_path) << metadata.dump(4);
|
||||
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
||||
write_file(metadata_path, metadata.dump(4));
|
||||
LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
||||
|
||||
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -605,16 +629,37 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
|
||||
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
|
||||
// User-Agent header is already set in common_remote_get_content, no need to set it here
|
||||
|
||||
// we use "=" to avoid clashing with other component, while still being allowed on windows
|
||||
std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
|
||||
string_replace_all(cached_response_fname, "/", "_");
|
||||
std::string cached_response_path = fs_get_cache_file(cached_response_fname);
|
||||
|
||||
// make the request
|
||||
common_remote_params params;
|
||||
params.headers = headers;
|
||||
auto res = common_remote_get_content(url, params);
|
||||
long res_code = res.first;
|
||||
std::string res_str(res.second.data(), res.second.size());
|
||||
long res_code = 0;
|
||||
std::string res_str;
|
||||
bool use_cache = false;
|
||||
try {
|
||||
auto res = common_remote_get_content(url, params);
|
||||
res_code = res.first;
|
||||
res_str = std::string(res.second.data(), res.second.size());
|
||||
} catch (const std::exception & e) {
|
||||
LOG_WRN("error: failed to get manifest: %s\n", e.what());
|
||||
LOG_WRN("try reading from cache\n");
|
||||
// try to read from cache
|
||||
try {
|
||||
res_str = read_file(cached_response_path);
|
||||
res_code = 200;
|
||||
use_cache = true;
|
||||
} catch (const std::exception & e) {
|
||||
throw std::runtime_error("error: failed to get manifest (check your internet connection)");
|
||||
}
|
||||
}
|
||||
std::string ggufFile;
|
||||
std::string mmprojFile;
|
||||
|
||||
if (res_code == 200) {
|
||||
if (res_code == 200 || res_code == 304) {
|
||||
// extract ggufFile.rfilename in json, using regex
|
||||
{
|
||||
std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
|
||||
@@ -631,6 +676,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
|
||||
mmprojFile = match[1].str();
|
||||
}
|
||||
}
|
||||
if (!use_cache) {
|
||||
// if not using cached response, update the cache file
|
||||
write_file(cached_response_path, res_str);
|
||||
}
|
||||
} else if (res_code == 401) {
|
||||
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
|
||||
} else {
|
||||
@@ -1142,6 +1191,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
|
||||
fprintf(stderr, "%s\n", ex.what());
|
||||
ctx_arg.params = params_org;
|
||||
return false;
|
||||
} catch (std::exception & ex) {
|
||||
fprintf(stderr, "%s\n", ex.what());
|
||||
exit(1); // for other exceptions, we exit with status code 1
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -1442,13 +1494,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"-f", "--file"}, "FNAME",
|
||||
"a file containing the prompt (default: none)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream file(value);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||
}
|
||||
params.prompt = read_file(value);
|
||||
// store the external file name in params
|
||||
params.prompt_file = value;
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
||||
if (!params.prompt.empty() && params.prompt.back() == '\n') {
|
||||
params.prompt.pop_back();
|
||||
}
|
||||
@@ -1458,11 +1506,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"-sysf", "--system-prompt-file"}, "FNAME",
|
||||
"a file containing the system prompt (default: none)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream file(value);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||
}
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
|
||||
params.system_prompt = read_file(value);
|
||||
if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
|
||||
params.system_prompt.pop_back();
|
||||
}
|
||||
@@ -1887,15 +1931,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--grammar-file"}, "FNAME",
|
||||
"file to read grammar from",
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream file(value);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||
}
|
||||
std::copy(
|
||||
std::istreambuf_iterator<char>(file),
|
||||
std::istreambuf_iterator<char>(),
|
||||
std::back_inserter(params.sampling.grammar)
|
||||
);
|
||||
params.sampling.grammar = read_file(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1905,6 +1941,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.sampling.grammar = json_schema_to_grammar(json::parse(value));
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"-jf", "--json-schema-file"}, "FILE",
|
||||
"File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream file(value);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||
}
|
||||
std::string schema;
|
||||
std::copy(
|
||||
std::istreambuf_iterator<char>(file),
|
||||
std::istreambuf_iterator<char>(),
|
||||
std::back_inserter(schema)
|
||||
);
|
||||
params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--pooling"}, "{none,mean,cls,last,rank}",
|
||||
"pooling type for embeddings, use model default if unspecified",
|
||||
@@ -2815,14 +2868,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
|
||||
),
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream file(value);
|
||||
if (!file) {
|
||||
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||
}
|
||||
std::copy(
|
||||
std::istreambuf_iterator<char>(file),
|
||||
std::istreambuf_iterator<char>(),
|
||||
std::back_inserter(params.chat_template));
|
||||
params.chat_template = read_file(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
|
||||
add_opt(common_arg(
|
||||
|
||||
+186
-127
@@ -16,6 +16,7 @@ from pathlib import Path
|
||||
from hashlib import sha256
|
||||
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
|
||||
from itertools import chain
|
||||
from transformers import AutoConfig
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
@@ -66,8 +67,6 @@ class ModelBase:
|
||||
part_names: list[str]
|
||||
is_safetensors: bool
|
||||
hparams: dict[str, Any]
|
||||
block_count: int
|
||||
tensor_map: gguf.TensorNameMap
|
||||
tensor_names: set[str] | None
|
||||
gguf_writer: gguf.GGUFWriter
|
||||
model_name: str | None
|
||||
@@ -78,7 +77,11 @@ class ModelBase:
|
||||
# subclasses should define this!
|
||||
model_arch: gguf.MODEL_ARCH
|
||||
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
||||
# subclasses should initialize this!
|
||||
block_count: int
|
||||
tensor_map: gguf.TensorNameMap
|
||||
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
|
||||
use_temp_file: bool = False, eager: bool = False,
|
||||
metadata_override: Path | None = None, model_name: str | None = None,
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
|
||||
@@ -113,8 +116,6 @@ class ModelBase:
|
||||
if not self.is_safetensors:
|
||||
self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
|
||||
self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams
|
||||
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
self.tensor_names = None
|
||||
self.metadata_override = metadata_override
|
||||
self.model_name = model_name
|
||||
@@ -417,15 +418,13 @@ class ModelBase:
|
||||
|
||||
@staticmethod
|
||||
def load_hparams(dir_model: Path):
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
architectures = hparams.get("architectures")
|
||||
if "text_config" in hparams:
|
||||
hparams = {**hparams, **hparams["text_config"]}
|
||||
if architectures is not None:
|
||||
# preserve "architectures" from root level config
|
||||
hparams["architectures"] = architectures
|
||||
return hparams
|
||||
try:
|
||||
return AutoConfig.from_pretrained(dir_model).to_dict()
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load model config from {dir_model}: {e}")
|
||||
logger.warning("Trying to load config.json instead")
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
@classmethod
|
||||
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
|
||||
@@ -454,6 +453,16 @@ class ModelBase:
|
||||
|
||||
|
||||
class TextModel(ModelBase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
if "text_config" in self.hparams:
|
||||
# move the text_config to the root level
|
||||
self.hparams = {**self.hparams, **self.hparams["text_config"]}
|
||||
|
||||
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
||||
@classmethod
|
||||
def __init_subclass__(cls):
|
||||
# can't use an abstract property, because overriding it without type errors
|
||||
@@ -1077,9 +1086,9 @@ class VisionModel(ModelBase):
|
||||
if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION:
|
||||
raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")
|
||||
|
||||
# small hack to correct the number of layers
|
||||
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, 128)
|
||||
self.n_embd_text = self.find_hparam(["hidden_size", "n_embd"])
|
||||
# get n_embd of the text model
|
||||
text_config = {**self.hparams, **self.hparams["text_config"]}
|
||||
self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
|
||||
assert self.n_embd_text > 0, "n_embd not found in hparams"
|
||||
|
||||
if "vision_config" not in self.hparams:
|
||||
@@ -1088,6 +1097,9 @@ class VisionModel(ModelBase):
|
||||
self.global_config = self.hparams
|
||||
self.hparams = self.hparams["vision_config"]
|
||||
|
||||
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
|
||||
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count)
|
||||
|
||||
# load preprocessor config
|
||||
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
|
||||
self.preprocessor_config = json.load(f)
|
||||
@@ -1105,12 +1117,12 @@ class VisionModel(ModelBase):
|
||||
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
|
||||
self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
|
||||
self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
|
||||
self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"]))
|
||||
self.gguf_writer.add_vision_block_count(self.block_count)
|
||||
self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
|
||||
|
||||
# preprocessor config
|
||||
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
|
||||
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_mean"])
|
||||
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
|
||||
|
||||
def write_vocab(self):
|
||||
raise ValueError("VisionModel does not support vocab writing")
|
||||
@@ -1726,23 +1738,12 @@ class StableLMModel(TextModel):
|
||||
"LlamaForCausalLM",
|
||||
"MistralForCausalLM",
|
||||
"MixtralForCausalLM",
|
||||
"Idefics3ForConditionalGeneration",
|
||||
"SmolVLMForConditionalGeneration",
|
||||
"VLlama3ForCausalLM",
|
||||
"LlavaForConditionalGeneration")
|
||||
class LlamaModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
undo_permute = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
# fix for SmolVLM2, missing `num_attention_heads` in config.json
|
||||
if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration":
|
||||
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
|
||||
# fix for Pixtral, missing `num_attention_heads` in config.json
|
||||
if self.hparams["architectures"][0] == "LlavaForConditionalGeneration" \
|
||||
and self.hparams.get("model_type") == "mistral":
|
||||
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
|
||||
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
@@ -1905,11 +1906,7 @@ class LlavaVisionModel(VisionModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
if self.hparams["model_type"] == "pixtral":
|
||||
# fix missing config.json values
|
||||
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
|
||||
self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 24)
|
||||
self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 4096)
|
||||
self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1024)
|
||||
# layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
|
||||
self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
|
||||
self.img_break_tok_id = 12 # see tokenizer_config.json
|
||||
else:
|
||||
@@ -1920,7 +1917,6 @@ class LlavaVisionModel(VisionModel):
|
||||
hparams = self.hparams
|
||||
if hparams["model_type"] == "pixtral":
|
||||
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
|
||||
# default values below are taken from HF tranformers code
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
|
||||
self.gguf_writer.add_vision_use_silu(True)
|
||||
|
||||
@@ -1951,13 +1947,12 @@ class LlavaVisionModel(VisionModel):
|
||||
class SmolVLMModel(VisionModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
# fix for SmolVLM2, missing some keys in config.json
|
||||
# default values are taken from transformers code
|
||||
if self.hparams["model_type"] == "smolvlm_vision":
|
||||
# fix for SmolVLM2, missing some keys in config.json
|
||||
# default values are taken from transformers code
|
||||
self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
|
||||
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
|
||||
self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
|
||||
self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
@@ -3373,14 +3368,7 @@ class BertModel(TextModel):
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("RobertaModel")
|
||||
class RobertaModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _xlmroberta_tokenizer_init(self) -> None:
|
||||
# we need the pad_token_id to know how to chop down position_embd matrix
|
||||
if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
|
||||
self._position_offset = 1 + pad_token_id
|
||||
@@ -3389,82 +3377,7 @@ class RobertaModel(BertModel):
|
||||
else:
|
||||
self._position_offset = None
|
||||
|
||||
def set_vocab(self):
|
||||
"""Support BPE tokenizers for roberta models"""
|
||||
bpe_tok_path = self.dir_model / "tokenizer.json"
|
||||
if bpe_tok_path.exists():
|
||||
self._set_vocab_gpt2()
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
# we need this to validate the size of the token_type embeddings
|
||||
# though currently we are passing all zeros to the token_type embeddings
|
||||
# "Sequence A" or "Sequence B"
|
||||
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
||||
|
||||
else:
|
||||
return super().set_vocab()
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# if name starts with "roberta.", remove the prefix
|
||||
# e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
|
||||
if name.startswith("roberta."):
|
||||
name = name[8:]
|
||||
|
||||
# position embeddings start at pad_token_id + 1, so just chop down the weight tensor
|
||||
if name == "embeddings.position_embeddings.weight":
|
||||
if self._position_offset is not None:
|
||||
data_torch = data_torch[self._position_offset:,:]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("NomicBertModel")
|
||||
class NomicBertModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.NOMIC_BERT
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# the HF config claims n_ctx=8192, but it uses RoPE scaling
|
||||
self.hparams["n_ctx"] = 2048
|
||||
|
||||
# SwigLU activation
|
||||
assert self.hparams["activation_function"] == "swiglu"
|
||||
# this doesn't do anything in the HF version
|
||||
assert self.hparams["causal"] is False
|
||||
# no bias tensors
|
||||
assert self.hparams["qkv_proj_bias"] is False
|
||||
assert self.hparams["mlp_fc1_bias"] is False
|
||||
assert self.hparams["mlp_fc2_bias"] is False
|
||||
# norm at end of layer
|
||||
assert self.hparams["prenorm"] is False
|
||||
# standard RoPE
|
||||
assert self.hparams["rotary_emb_fraction"] == 1.0
|
||||
assert self.hparams["rotary_emb_interleaved"] is False
|
||||
assert self.hparams["rotary_emb_scale_base"] is None
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
||||
|
||||
|
||||
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
|
||||
class XLMRobertaModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# we need the pad_token_id to know how to chop down position_embd matrix
|
||||
if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
|
||||
self._position_offset = 1 + pad_token_id
|
||||
if "max_position_embeddings" in self.hparams:
|
||||
self.hparams["max_position_embeddings"] -= self._position_offset
|
||||
else:
|
||||
self._position_offset = None
|
||||
|
||||
def set_vocab(self):
|
||||
def _xlmroberta_set_vocab(self) -> None:
|
||||
# to avoid TypeError: Descriptors cannot be created directly
|
||||
# exception when importing sentencepiece_model_pb2
|
||||
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
||||
@@ -3546,6 +3459,140 @@ class XLMRobertaModel(BertModel):
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
|
||||
@ModelBase.register("RobertaModel")
|
||||
class RobertaModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# we need the pad_token_id to know how to chop down position_embd matrix
|
||||
if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
|
||||
self._position_offset = 1 + pad_token_id
|
||||
if "max_position_embeddings" in self.hparams:
|
||||
self.hparams["max_position_embeddings"] -= self._position_offset
|
||||
else:
|
||||
self._position_offset = None
|
||||
|
||||
def set_vocab(self):
|
||||
"""Support BPE tokenizers for roberta models"""
|
||||
bpe_tok_path = self.dir_model / "tokenizer.json"
|
||||
if bpe_tok_path.exists():
|
||||
self._set_vocab_gpt2()
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
# we need this to validate the size of the token_type embeddings
|
||||
# though currently we are passing all zeros to the token_type embeddings
|
||||
# "Sequence A" or "Sequence B"
|
||||
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
||||
|
||||
else:
|
||||
return super().set_vocab()
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# if name starts with "roberta.", remove the prefix
|
||||
# e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
|
||||
if name.startswith("roberta."):
|
||||
name = name[8:]
|
||||
|
||||
# position embeddings start at pad_token_id + 1, so just chop down the weight tensor
|
||||
if name == "embeddings.position_embeddings.weight":
|
||||
if self._position_offset is not None:
|
||||
data_torch = data_torch[self._position_offset:,:]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("NomicBertModel")
|
||||
class NomicBertModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
|
||||
hparams = kwargs.pop("hparams", None)
|
||||
if hparams is None:
|
||||
hparams = ModelBase.load_hparams(dir_model)
|
||||
|
||||
self.is_moe = bool(hparams.get("moe_every_n_layers"))
|
||||
self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT
|
||||
|
||||
super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
|
||||
|
||||
self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
|
||||
if self._tokenizer_is_xlmroberta:
|
||||
self._xlmroberta_tokenizer_init()
|
||||
|
||||
# the HF config claims n_ctx=8192, but it uses RoPE scaling
|
||||
self.hparams["n_ctx"] = 2048
|
||||
|
||||
assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu"
|
||||
|
||||
# this doesn't do anything in the HF version
|
||||
assert self.hparams["causal"] is False
|
||||
# no bias tensors unless MoE
|
||||
assert self.hparams["qkv_proj_bias"] == self.is_moe
|
||||
assert self.hparams["mlp_fc1_bias"] == self.is_moe
|
||||
assert self.hparams["mlp_fc2_bias"] == self.is_moe
|
||||
|
||||
# norm at end of layer
|
||||
assert self.hparams["prenorm"] is False
|
||||
# standard RoPE
|
||||
assert self.hparams["rotary_emb_fraction"] == 1.0
|
||||
assert self.hparams["rotary_emb_interleaved"] is False
|
||||
assert self.hparams["rotary_emb_scale_base"] is None
|
||||
|
||||
def set_vocab(self) -> None:
|
||||
if self._tokenizer_is_xlmroberta:
|
||||
return self._xlmroberta_set_vocab()
|
||||
return super().set_vocab()
|
||||
|
||||
def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
|
||||
# If the tensor is an experts bias tensor, skip it by returning an empty list.
|
||||
if "mlp.experts.bias" in name:
|
||||
return [] # Explicitly return an empty list.
|
||||
|
||||
if "mlp.experts.mlp.w1" in name:
|
||||
data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
|
||||
name += ".weight"
|
||||
|
||||
if "mlp.experts.mlp.w2" in name:
|
||||
data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
|
||||
data_torch = data_torch.transpose(1, 2)
|
||||
name += ".weight"
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
||||
if self.is_moe:
|
||||
self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
|
||||
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
|
||||
self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
|
||||
|
||||
def _is_tokenizer_xlmroberta(self) -> bool:
|
||||
with open(self.dir_model / "tokenizer.json") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
toktyp = tokenizer_json["model"]["type"]
|
||||
if toktyp == "Unigram":
|
||||
return True
|
||||
if toktyp == "WordPiece":
|
||||
return False
|
||||
raise ValueError(f"unknown tokenizer: {toktyp}")
|
||||
|
||||
|
||||
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
|
||||
class XLMRobertaModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._xlmroberta_tokenizer_init()
|
||||
|
||||
def set_vocab(self):
|
||||
self._xlmroberta_set_vocab()
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# if name starts with "roberta.", remove the prefix
|
||||
# e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
|
||||
@@ -5806,6 +5853,19 @@ def split_str_to_n_bytes(split_str: str) -> int:
|
||||
return n
|
||||
|
||||
|
||||
def get_model_architecture(dir_model: Path, model_type: ModelType, hparams: Any = None) -> str:
|
||||
hparams = ModelBase.load_hparams(dir_model) if hparams is None else hparams
|
||||
text_config = hparams.get("text_config", {})
|
||||
vision_config = hparams.get("vision_config", {})
|
||||
arch = hparams["architectures"][0]
|
||||
# if "architectures" is found in the sub-config, use that instead
|
||||
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
|
||||
arch = text_config["architectures"][0]
|
||||
elif model_type == ModelType.VISION and vision_config.get("architectures") is not None:
|
||||
arch = vision_config["architectures"][0]
|
||||
return arch
|
||||
|
||||
|
||||
def main() -> None:
|
||||
args = parse_args()
|
||||
|
||||
@@ -5858,16 +5918,15 @@ def main() -> None:
|
||||
|
||||
logger.info(f"Loading model: {dir_model.name}")
|
||||
|
||||
hparams = ModelBase.load_hparams(dir_model)
|
||||
|
||||
if args.mmproj:
|
||||
if "mmproj" not in fname_out.name:
|
||||
fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
|
||||
|
||||
with torch.inference_mode():
|
||||
output_type = ftype_map[args.outtype]
|
||||
model_architecture = hparams["architectures"][0]
|
||||
model_type = ModelType.VISION if args.mmproj else ModelType.TEXT
|
||||
model_architecture = get_model_architecture(dir_model, model_type)
|
||||
logger.info(f"Model architecture: {model_architecture}")
|
||||
try:
|
||||
model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
|
||||
except NotImplementedError:
|
||||
|
||||
@@ -28,6 +28,7 @@ options:
|
||||
-p, --n-prompt <n> (default: 512)
|
||||
-n, --n-gen <n> (default: 128)
|
||||
-pg <pp,tg> (default: )
|
||||
-d, --n-depth <n> (default: 0)
|
||||
-b, --batch-size <n> (default: 2048)
|
||||
-ub, --ubatch-size <n> (default: 512)
|
||||
-ctk, --cache-type-k <t> (default: f16)
|
||||
@@ -66,6 +67,8 @@ With the exception of `-r`, `-o` and `-v`, all options can be specified multiple
|
||||
|
||||
Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
|
||||
|
||||
Using the `-d <n>` option, each test can be run at a specified context depth, prefilling the KV cache with `<n>` tokens.
|
||||
|
||||
For a description of the other options, see the [main example](../main/README.md).
|
||||
|
||||
Note:
|
||||
@@ -148,6 +151,19 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |
|
||||
|
||||
### Different prefilled context
|
||||
|
||||
```
|
||||
$ ./llama-bench -d 0,512
|
||||
```
|
||||
|
||||
| model | size | params | backend | ngl | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 |
|
||||
|
||||
## Output formats
|
||||
|
||||
By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
|
||||
@@ -170,9 +186,9 @@ $ ./llama-bench -o csv
|
||||
```
|
||||
|
||||
```csv
|
||||
build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
|
||||
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
|
||||
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
|
||||
build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
|
||||
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
|
||||
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
|
||||
```
|
||||
|
||||
### JSON
|
||||
@@ -184,64 +200,78 @@ $ ./llama-bench -o json
|
||||
```json
|
||||
[
|
||||
{
|
||||
"build_commit": "3469684",
|
||||
"build_number": 1275,
|
||||
"cuda": true,
|
||||
"metal": false,
|
||||
"gpu_blas": true,
|
||||
"blas": true,
|
||||
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
|
||||
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
|
||||
"model_filename": "models/7B/ggml-model-q4_0.gguf",
|
||||
"model_type": "llama 7B mostly Q4_0",
|
||||
"model_size": 3825065984,
|
||||
"model_n_params": 6738415616,
|
||||
"n_batch": 512,
|
||||
"n_threads": 16,
|
||||
"f16_kv": true,
|
||||
"build_commit": "8cf427ff",
|
||||
"build_number": 5163,
|
||||
"cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
|
||||
"gpu_info": "NVIDIA GeForce RTX 4080",
|
||||
"backends": "CUDA",
|
||||
"model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
|
||||
"model_type": "qwen2 7B Q4_K - Medium",
|
||||
"model_size": 4677120000,
|
||||
"model_n_params": 7615616512,
|
||||
"n_batch": 2048,
|
||||
"n_ubatch": 512,
|
||||
"n_threads": 8,
|
||||
"cpu_mask": "0x0",
|
||||
"cpu_strict": false,
|
||||
"poll": 50,
|
||||
"type_k": "f16",
|
||||
"type_v": "f16",
|
||||
"n_gpu_layers": 99,
|
||||
"split_mode": "layer",
|
||||
"main_gpu": 0,
|
||||
"mul_mat_q": true,
|
||||
"no_kv_offload": false,
|
||||
"flash_attn": false,
|
||||
"tensor_split": "0.00",
|
||||
"use_mmap": true,
|
||||
"embeddings": false,
|
||||
"n_prompt": 512,
|
||||
"n_gen": 0,
|
||||
"test_time": "2023-09-23T12:09:57Z",
|
||||
"avg_ns": 212365953,
|
||||
"stddev_ns": 985423,
|
||||
"avg_ts": 2410.974041,
|
||||
"stddev_ts": 11.163766,
|
||||
"samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
|
||||
"samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
|
||||
"n_depth": 0,
|
||||
"test_time": "2025-04-24T11:58:50Z",
|
||||
"avg_ns": 72135640,
|
||||
"stddev_ns": 1453752,
|
||||
"avg_ts": 7100.002165,
|
||||
"stddev_ts": 140.341520,
|
||||
"samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ],
|
||||
"samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ]
|
||||
},
|
||||
{
|
||||
"build_commit": "3469684",
|
||||
"build_number": 1275,
|
||||
"cuda": true,
|
||||
"metal": false,
|
||||
"gpu_blas": true,
|
||||
"blas": true,
|
||||
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
|
||||
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
|
||||
"model_filename": "models/7B/ggml-model-q4_0.gguf",
|
||||
"model_type": "llama 7B mostly Q4_0",
|
||||
"model_size": 3825065984,
|
||||
"model_n_params": 6738415616,
|
||||
"n_batch": 512,
|
||||
"n_threads": 16,
|
||||
"f16_kv": true,
|
||||
"build_commit": "8cf427ff",
|
||||
"build_number": 5163,
|
||||
"cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
|
||||
"gpu_info": "NVIDIA GeForce RTX 4080",
|
||||
"backends": "CUDA",
|
||||
"model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
|
||||
"model_type": "qwen2 7B Q4_K - Medium",
|
||||
"model_size": 4677120000,
|
||||
"model_n_params": 7615616512,
|
||||
"n_batch": 2048,
|
||||
"n_ubatch": 512,
|
||||
"n_threads": 8,
|
||||
"cpu_mask": "0x0",
|
||||
"cpu_strict": false,
|
||||
"poll": 50,
|
||||
"type_k": "f16",
|
||||
"type_v": "f16",
|
||||
"n_gpu_layers": 99,
|
||||
"split_mode": "layer",
|
||||
"main_gpu": 0,
|
||||
"mul_mat_q": true,
|
||||
"no_kv_offload": false,
|
||||
"flash_attn": false,
|
||||
"tensor_split": "0.00",
|
||||
"use_mmap": true,
|
||||
"embeddings": false,
|
||||
"n_prompt": 0,
|
||||
"n_gen": 128,
|
||||
"test_time": "2023-09-23T12:09:59Z",
|
||||
"avg_ns": 977425219,
|
||||
"stddev_ns": 9268593,
|
||||
"avg_ts": 130.965708,
|
||||
"stddev_ts": 1.238924,
|
||||
"samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
|
||||
"samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
|
||||
"n_depth": 0,
|
||||
"test_time": "2025-04-24T11:58:51Z",
|
||||
"avg_ns": 1076767880,
|
||||
"stddev_ns": 9449585,
|
||||
"avg_ts": 118.881588,
|
||||
"stddev_ts": 1.041811,
|
||||
"samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ],
|
||||
"samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ]
|
||||
}
|
||||
]
|
||||
```
|
||||
@@ -254,8 +284,8 @@ $ ./llama-bench -o jsonl
|
||||
```
|
||||
|
||||
```json lines
|
||||
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
|
||||
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
|
||||
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
|
||||
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
|
||||
```
|
||||
|
||||
|
||||
@@ -271,25 +301,32 @@ $ ./llama-bench -o sql
|
||||
CREATE TABLE IF NOT EXISTS test (
|
||||
build_commit TEXT,
|
||||
build_number INTEGER,
|
||||
cuda INTEGER,
|
||||
metal INTEGER,
|
||||
gpu_blas INTEGER,
|
||||
blas INTEGER,
|
||||
cpu_info TEXT,
|
||||
gpu_info TEXT,
|
||||
backends TEXT,
|
||||
model_filename TEXT,
|
||||
model_type TEXT,
|
||||
model_size INTEGER,
|
||||
model_n_params INTEGER,
|
||||
n_batch INTEGER,
|
||||
n_ubatch INTEGER,
|
||||
n_threads INTEGER,
|
||||
f16_kv INTEGER,
|
||||
cpu_mask TEXT,
|
||||
cpu_strict INTEGER,
|
||||
poll INTEGER,
|
||||
type_k TEXT,
|
||||
type_v TEXT,
|
||||
n_gpu_layers INTEGER,
|
||||
split_mode TEXT,
|
||||
main_gpu INTEGER,
|
||||
mul_mat_q INTEGER,
|
||||
no_kv_offload INTEGER,
|
||||
flash_attn INTEGER,
|
||||
tensor_split TEXT,
|
||||
use_mmap INTEGER,
|
||||
embeddings INTEGER,
|
||||
n_prompt INTEGER,
|
||||
n_gen INTEGER,
|
||||
n_depth INTEGER,
|
||||
test_time TEXT,
|
||||
avg_ns INTEGER,
|
||||
stddev_ns INTEGER,
|
||||
@@ -297,6 +334,6 @@ CREATE TABLE IF NOT EXISTS test (
|
||||
stddev_ts REAL
|
||||
);
|
||||
|
||||
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
|
||||
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
|
||||
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
|
||||
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
|
||||
```
|
||||
|
||||
@@ -200,6 +200,7 @@ struct cmd_params {
|
||||
std::vector<int> n_prompt;
|
||||
std::vector<int> n_gen;
|
||||
std::vector<std::pair<int, int>> n_pg;
|
||||
std::vector<int> n_depth;
|
||||
std::vector<int> n_batch;
|
||||
std::vector<int> n_ubatch;
|
||||
std::vector<ggml_type> type_k;
|
||||
@@ -233,6 +234,7 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* n_prompt */ { 512 },
|
||||
/* n_gen */ { 128 },
|
||||
/* n_pg */ {},
|
||||
/* n_depth */ { 0 },
|
||||
/* n_batch */ { 2048 },
|
||||
/* n_ubatch */ { 512 },
|
||||
/* type_k */ { GGML_TYPE_F16 },
|
||||
@@ -272,6 +274,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
||||
printf(" -pg <pp,tg> (default: %s)\n",
|
||||
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
||||
printf(" -d, --n-depth <n> (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str());
|
||||
printf(" -b, --batch-size <n> (default: %s)\n",
|
||||
join(cmd_params_defaults.n_batch, ",").c_str());
|
||||
printf(" -ub, --ubatch-size <n> (default: %s)\n",
|
||||
@@ -409,6 +412,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
|
||||
} else if (arg == "-d" || arg == "--n-depth") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
|
||||
} else if (arg == "-b" || arg == "--batch-size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -739,6 +749,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.n_pg.empty()) {
|
||||
params.n_pg = cmd_params_defaults.n_pg;
|
||||
}
|
||||
if (params.n_depth.empty()) {
|
||||
params.n_depth = cmd_params_defaults.n_depth;
|
||||
}
|
||||
if (params.n_batch.empty()) {
|
||||
params.n_batch = cmd_params_defaults.n_batch;
|
||||
}
|
||||
@@ -801,6 +814,7 @@ struct cmd_params_instance {
|
||||
std::string model;
|
||||
int n_prompt;
|
||||
int n_gen;
|
||||
int n_depth;
|
||||
int n_batch;
|
||||
int n_ubatch;
|
||||
ggml_type type_k;
|
||||
@@ -880,7 +894,7 @@ struct cmd_params_instance {
|
||||
llama_context_params to_llama_cparams() const {
|
||||
llama_context_params cparams = llama_context_default_params();
|
||||
|
||||
cparams.n_ctx = n_prompt + n_gen;
|
||||
cparams.n_ctx = n_prompt + n_gen + n_depth;
|
||||
cparams.n_batch = n_batch;
|
||||
cparams.n_ubatch = n_ubatch;
|
||||
cparams.type_k = type_k;
|
||||
@@ -916,6 +930,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
for (const auto & nt : params.n_threads)
|
||||
for (const auto & cm : params.cpu_mask)
|
||||
for (const auto & cs : params.cpu_strict)
|
||||
for (const auto & nd : params.n_depth)
|
||||
for (const auto & pl : params.poll) {
|
||||
for (const auto & n_prompt : params.n_prompt) {
|
||||
if (n_prompt == 0) {
|
||||
@@ -925,6 +940,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .model = */ m,
|
||||
/* .n_prompt = */ n_prompt,
|
||||
/* .n_gen = */ 0,
|
||||
/* .n_depth = */ nd,
|
||||
/* .n_batch = */ nb,
|
||||
/* .n_ubatch = */ nub,
|
||||
/* .type_k = */ tk,
|
||||
@@ -955,6 +971,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .model = */ m,
|
||||
/* .n_prompt = */ 0,
|
||||
/* .n_gen = */ n_gen,
|
||||
/* .n_depth = */ nd,
|
||||
/* .n_batch = */ nb,
|
||||
/* .n_ubatch = */ nub,
|
||||
/* .type_k = */ tk,
|
||||
@@ -985,6 +1002,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .model = */ m,
|
||||
/* .n_prompt = */ n_pg.first,
|
||||
/* .n_gen = */ n_pg.second,
|
||||
/* .n_depth = */ nd,
|
||||
/* .n_batch = */ nb,
|
||||
/* .n_ubatch = */ nub,
|
||||
/* .type_k = */ tk,
|
||||
@@ -1040,6 +1058,7 @@ struct test {
|
||||
bool embeddings;
|
||||
int n_prompt;
|
||||
int n_gen;
|
||||
int n_depth;
|
||||
std::string test_time;
|
||||
std::vector<uint64_t> samples_ns;
|
||||
|
||||
@@ -1072,6 +1091,7 @@ struct test {
|
||||
embeddings = inst.embeddings;
|
||||
n_prompt = inst.n_prompt;
|
||||
n_gen = inst.n_gen;
|
||||
n_depth = inst.n_depth;
|
||||
// RFC 3339 date-time format
|
||||
time_t t = time(NULL);
|
||||
std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
|
||||
@@ -1114,8 +1134,8 @@ struct test {
|
||||
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
|
||||
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
||||
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
|
||||
"use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns",
|
||||
"stddev_ns", "avg_ts", "stddev_ts",
|
||||
"use_mmap", "embeddings", "n_prompt", "n_gen", "n_depth", "test_time",
|
||||
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
|
||||
};
|
||||
return fields;
|
||||
}
|
||||
@@ -1125,8 +1145,8 @@ struct test {
|
||||
static field_type get_field_type(const std::string & field) {
|
||||
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
|
||||
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
|
||||
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
|
||||
field == "stddev_ns") {
|
||||
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
|
||||
field == "avg_ns" || field == "stddev_ns") {
|
||||
return INT;
|
||||
}
|
||||
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
|
||||
@@ -1204,6 +1224,7 @@ struct test {
|
||||
std::to_string(embeddings),
|
||||
std::to_string(n_prompt),
|
||||
std::to_string(n_gen),
|
||||
std::to_string(n_depth),
|
||||
test_time,
|
||||
std::to_string(avg_ns()),
|
||||
std::to_string(stdev_ns()),
|
||||
@@ -1381,7 +1402,7 @@ struct markdown_printer : public printer {
|
||||
return 4;
|
||||
}
|
||||
if (field == "test") {
|
||||
return 13;
|
||||
return 15;
|
||||
}
|
||||
|
||||
int width = std::max((int) field.length(), 10);
|
||||
@@ -1531,6 +1552,10 @@ struct markdown_printer : public printer {
|
||||
} else {
|
||||
snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
|
||||
}
|
||||
if (t.n_depth > 0) {
|
||||
int len = strlen(buf);
|
||||
snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth);
|
||||
}
|
||||
value = buf;
|
||||
} else if (field == "t/s") {
|
||||
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
|
||||
@@ -1789,6 +1814,14 @@ int main(int argc, char ** argv) {
|
||||
for (int i = 0; i < params.reps; i++) {
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
if (t.n_depth > 0) {
|
||||
if (params.progress) {
|
||||
fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
|
||||
i + 1, params.reps);
|
||||
}
|
||||
test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
|
||||
}
|
||||
|
||||
uint64_t t_start = get_time_ns();
|
||||
|
||||
if (t.n_prompt > 0) {
|
||||
|
||||
@@ -64,13 +64,7 @@ endif()
|
||||
add_executable(llama-llava-cli deprecation-warning.cpp)
|
||||
add_executable(llama-gemma3-cli deprecation-warning.cpp)
|
||||
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
|
||||
|
||||
set(TARGET llama-qwen2vl-cli)
|
||||
add_executable(${TARGET} qwen2vl-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
|
||||
|
||||
set(TARGET llama-mtmd-cli)
|
||||
add_executable(${TARGET} mtmd-cli.cpp)
|
||||
|
||||
@@ -2,8 +2,6 @@
|
||||
#include "gguf.h"
|
||||
#include "clip.h"
|
||||
|
||||
#include "clip.h"
|
||||
|
||||
#include <climits>
|
||||
#include <cstdarg>
|
||||
#include <string>
|
||||
|
||||
+248
-263
@@ -170,8 +170,8 @@ struct clip_hparams {
|
||||
std::vector<int32_t> image_grid_pinpoints;
|
||||
int32_t image_crop_resolution;
|
||||
std::unordered_set<int32_t> vision_feature_layer;
|
||||
int32_t attn_window_size;
|
||||
int32_t n_wa_pattern;
|
||||
int32_t attn_window_size = 0;
|
||||
int32_t n_wa_pattern = 0;
|
||||
};
|
||||
|
||||
struct clip_layer {
|
||||
@@ -325,7 +325,6 @@ struct clip_ctx {
|
||||
float image_std[3];
|
||||
bool use_gelu = false;
|
||||
bool use_silu = false;
|
||||
int32_t ftype = 1;
|
||||
|
||||
gguf_context_ptr ctx_gguf;
|
||||
ggml_context_ptr ctx_data;
|
||||
@@ -776,7 +775,6 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
|
||||
const int image_size_width = imgs.entries[0]->nx;
|
||||
const int image_size_height = imgs.entries[0]->ny;
|
||||
|
||||
const bool use_mrope = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
|
||||
const bool use_window_attn = hparams.n_wa_pattern > 0;
|
||||
|
||||
const int n_wa_pattern = hparams.n_wa_pattern;
|
||||
@@ -785,10 +783,11 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
|
||||
const int patches_w = image_size_width / patch_size;
|
||||
const int patches_h = image_size_height / patch_size;
|
||||
const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
|
||||
const int num_position_ids = use_mrope ? num_positions * 4 : num_positions;
|
||||
const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position
|
||||
const int hidden_size = hparams.hidden_size;
|
||||
const int n_head = hparams.n_head;
|
||||
const int d_head = hidden_size / n_head;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const float eps = hparams.eps;
|
||||
|
||||
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
||||
@@ -870,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
|
||||
}
|
||||
|
||||
// loop over layers
|
||||
for (int il = 0; il < ctx->max_feature_layer; il++) {
|
||||
for (int il = 0; il < n_layer; il++) {
|
||||
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
|
||||
|
||||
// rmsnorm1
|
||||
@@ -1115,15 +1114,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
|
||||
int pos_w = image_size_width/patch_size;
|
||||
int pos_h = image_size_height/patch_size;
|
||||
if (ctx->minicpmv_version == 2) {
|
||||
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
|
||||
}
|
||||
else if (ctx->minicpmv_version == 3) {
|
||||
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
|
||||
}
|
||||
else if (ctx->minicpmv_version == 4) {
|
||||
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
|
||||
}
|
||||
int n_output_dim = clip_n_mmproj_embd(ctx);
|
||||
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1);
|
||||
ggml_set_name(pos_embed, "pos_embed");
|
||||
ggml_set_input(pos_embed);
|
||||
}
|
||||
@@ -1461,23 +1453,17 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
|
||||
}
|
||||
|
||||
{ // attention
|
||||
int hidden_size = 4096;
|
||||
int hidden_size = clip_n_mmproj_embd(ctx);
|
||||
const int d_head = 128;
|
||||
int n_head = hidden_size/d_head;
|
||||
int num_query = 96;
|
||||
if (ctx->minicpmv_version == 2) {
|
||||
hidden_size = 4096;
|
||||
n_head = hidden_size/d_head;
|
||||
num_query = 96;
|
||||
}
|
||||
else if (ctx->minicpmv_version == 3) {
|
||||
hidden_size = 3584;
|
||||
n_head = hidden_size/d_head;
|
||||
num_query = 64;
|
||||
}
|
||||
else if (ctx->minicpmv_version == 4) {
|
||||
hidden_size = 3584;
|
||||
n_head = hidden_size/d_head;
|
||||
num_query = 64;
|
||||
}
|
||||
|
||||
@@ -1588,7 +1574,7 @@ struct clip_model_loader {
|
||||
clip_ctx & ctx_clip;
|
||||
std::string fname;
|
||||
|
||||
size_t model_size; // in bytes
|
||||
size_t model_size = 0; // in bytes
|
||||
|
||||
// TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model
|
||||
clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) {
|
||||
@@ -1760,6 +1746,10 @@ struct clip_model_loader {
|
||||
LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
|
||||
LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
|
||||
LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
|
||||
LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
|
||||
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
|
||||
LOG_INF("%s: use_silu: %d\n", __func__, ctx_clip.use_silu);
|
||||
LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu);
|
||||
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||
LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
|
||||
}
|
||||
@@ -2835,15 +2825,18 @@ void clip_free(clip_ctx * ctx) {
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
// deprecated
|
||||
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
|
||||
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
|
||||
const int32_t nx = ctx->vision_model.hparams.image_size;
|
||||
const int32_t ny = ctx->vision_model.hparams.image_size;
|
||||
return clip_embd_nbytes_by_img(ctx, nx, ny);
|
||||
}
|
||||
|
||||
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
|
||||
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
|
||||
clip_image_f32 img;
|
||||
img.nx = img_w;
|
||||
img.ny = img_h;
|
||||
return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
|
||||
return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
|
||||
}
|
||||
|
||||
int32_t clip_get_image_size(const struct clip_ctx * ctx) {
|
||||
@@ -2873,14 +2866,37 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.hparams.image_grid_pinpoints.size();
|
||||
}
|
||||
|
||||
// deprecated
|
||||
int clip_n_patches(const struct clip_ctx * ctx) {
|
||||
clip_image_f32 img;
|
||||
img.nx = ctx->vision_model.hparams.image_size;
|
||||
img.ny = ctx->vision_model.hparams.image_size;
|
||||
return clip_n_patches_by_img(ctx, &img);
|
||||
return clip_n_output_tokens(ctx, &img);
|
||||
}
|
||||
|
||||
// deprecated
|
||||
int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
||||
return clip_n_output_tokens(ctx, img);
|
||||
}
|
||||
|
||||
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
||||
const auto & params = ctx->vision_model.hparams;
|
||||
const int n_total = clip_n_output_tokens(ctx, img);
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
||||
return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
|
||||
}
|
||||
return n_total;
|
||||
}
|
||||
|
||||
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
||||
const auto & params = ctx->vision_model.hparams;
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
||||
return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
||||
const auto & params = ctx->vision_model.hparams;
|
||||
|
||||
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
||||
@@ -3038,15 +3054,43 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
|
||||
const int pos_w = ctx->load_image_size.width / patch_size;
|
||||
const int pos_w = ctx->load_image_size.width / patch_size;
|
||||
const int pos_h = ctx->load_image_size.height / patch_size;
|
||||
|
||||
const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
|
||||
|
||||
auto get_inp_tensor = [&gf](const char * name) {
|
||||
struct ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
|
||||
if (inp == nullptr) {
|
||||
GGML_ABORT("Failed to get tensor %s", name);
|
||||
}
|
||||
if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
|
||||
GGML_ABORT("Tensor %s is not an input tensor", name);
|
||||
}
|
||||
return inp;
|
||||
};
|
||||
|
||||
auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
|
||||
ggml_tensor * cur = get_inp_tensor(name);
|
||||
GGML_ASSERT(cur->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
|
||||
ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
|
||||
};
|
||||
|
||||
auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
|
||||
ggml_tensor * cur = get_inp_tensor(name);
|
||||
GGML_ASSERT(cur->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
|
||||
ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
|
||||
};
|
||||
|
||||
// set input pixel values
|
||||
{
|
||||
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
|
||||
std::vector<float> inp_data(ggml_nelements(inp_raw));
|
||||
float * data = inp_data.data();
|
||||
size_t nelem = 0;
|
||||
for (const auto & img : imgs.entries) {
|
||||
nelem += img->nx * img->ny * 3;
|
||||
}
|
||||
std::vector<float> inp_raw(nelem);
|
||||
|
||||
// layout of data (note: the channel dim is unrolled to better visualize the layout):
|
||||
//
|
||||
@@ -3065,7 +3109,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
const int n = nx * ny;
|
||||
|
||||
for (int b = 0; b < batch_size; b++) {
|
||||
float * batch_entry = data + b * (3*n);
|
||||
float * batch_entry = inp_raw.data() + b * (3*n);
|
||||
for (int y = 0; y < ny; y++) {
|
||||
for (int x = 0; x < nx; x++) {
|
||||
size_t base_src = 3*(y * nx + x); // idx of the first channel
|
||||
@@ -3077,266 +3121,207 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
}
|
||||
}
|
||||
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
|
||||
set_input_f32("inp_raw", inp_raw);
|
||||
}
|
||||
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
|
||||
{
|
||||
// inspired from siglip:
|
||||
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
|
||||
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
|
||||
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
||||
std::vector<int> pos_data(ggml_nelements(positions));
|
||||
int * data = pos_data.data();
|
||||
int bucket_coords_h[1024];
|
||||
int bucket_coords_w[1024];
|
||||
for (int i = 0; i < pos_h; i++){
|
||||
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
|
||||
}
|
||||
for (int i = 0; i < pos_w; i++){
|
||||
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
|
||||
}
|
||||
for (int i = 0, id = 0; i < pos_h; i++){
|
||||
for (int j = 0; j < pos_w; j++){
|
||||
data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
|
||||
// set input per projector
|
||||
switch (ctx->proj_type) {
|
||||
case PROJECTOR_TYPE_MINICPMV:
|
||||
{
|
||||
// inspired from siglip:
|
||||
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
|
||||
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
|
||||
std::vector<int32_t> positions(pos_h * pos_w);
|
||||
int bucket_coords_h[1024];
|
||||
int bucket_coords_w[1024];
|
||||
for (int i = 0; i < pos_h; i++){
|
||||
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
|
||||
}
|
||||
}
|
||||
ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions));
|
||||
}
|
||||
|
||||
{
|
||||
// inspired from resampler of Qwen-VL:
|
||||
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
|
||||
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
|
||||
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
|
||||
int embed_dim = 4096;
|
||||
if (ctx->minicpmv_version == 2) {
|
||||
embed_dim = 4096;
|
||||
}
|
||||
else if (ctx->minicpmv_version == 3) {
|
||||
embed_dim = 3584;
|
||||
}
|
||||
else if (ctx->minicpmv_version == 4) {
|
||||
embed_dim = 3584;
|
||||
}
|
||||
else {
|
||||
GGML_ABORT("Unknown minicpmv version");
|
||||
}
|
||||
|
||||
// TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
|
||||
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
||||
|
||||
std::vector<float> pos_data(ggml_nelements(pos_embed));
|
||||
float * data = pos_data.data();
|
||||
for(int i = 0; i < pos_w * pos_h; ++i){
|
||||
for(int j = 0; j < embed_dim; ++j){
|
||||
data[i * embed_dim + j] = pos_embed_t[i][j];
|
||||
for (int i = 0; i < pos_w; i++){
|
||||
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
|
||||
}
|
||||
}
|
||||
for (int i = 0, id = 0; i < pos_h; i++){
|
||||
for (int j = 0; j < pos_w; j++){
|
||||
positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
|
||||
}
|
||||
}
|
||||
set_input_i32("positions", positions);
|
||||
|
||||
ggml_backend_tensor_set(pos_embed, data, 0, ggml_nbytes(pos_embed));
|
||||
}
|
||||
}
|
||||
else {
|
||||
// non-minicpmv models
|
||||
// inspired from resampler of Qwen-VL:
|
||||
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
|
||||
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
|
||||
int embed_dim = clip_n_mmproj_embd(ctx);
|
||||
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
||||
// pw * ph = number of tokens output by ViT after apply patch merger
|
||||
// ipw * ipw = number of vision token been processed inside ViT
|
||||
const int merge_ratio = 2;
|
||||
const int pw = image_size_width / patch_size / merge_ratio;
|
||||
const int ph = image_size_height / patch_size / merge_ratio;
|
||||
const int ipw = image_size_width / patch_size;
|
||||
const int iph = image_size_height / patch_size;
|
||||
// TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
|
||||
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
||||
|
||||
std::vector<int> idx (ph * pw);
|
||||
std::vector<int> inv_idx(ph * pw);
|
||||
std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
|
||||
for(int i = 0; i < pos_w * pos_h; ++i){
|
||||
for(int j = 0; j < embed_dim; ++j){
|
||||
pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
if (use_window_attn) {
|
||||
const int attn_window_size = 112;
|
||||
struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx");
|
||||
struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx");
|
||||
struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask");
|
||||
|
||||
const int grid_window = attn_window_size / patch_size / merge_ratio;
|
||||
int dst = 0;
|
||||
// [num_vision_tokens, num_vision_tokens] attention mask tensor
|
||||
std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
|
||||
int mask_row = 0;
|
||||
|
||||
for (int y = 0; y < ph; y += grid_window)
|
||||
{
|
||||
for (int x = 0; x < pw; x += grid_window)
|
||||
{
|
||||
const int win_h = std::min(grid_window, ph - y);
|
||||
const int win_w = std::min(grid_window, pw - x);
|
||||
const int dst_0 = dst;
|
||||
// group all tokens belong to the same window togather (to a continue range)
|
||||
for (int dy = 0; dy < win_h; dy++) {
|
||||
for (int dx = 0; dx < win_w; dx++) {
|
||||
const int src = (y + dy) * pw + (x + dx);
|
||||
assert(src < (int)idx.size());
|
||||
assert(dst < (int)inv_idx.size());
|
||||
idx [src] = dst;
|
||||
inv_idx[dst] = src;
|
||||
dst++;
|
||||
set_input_f32("pos_embed", pos_embed);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
{
|
||||
const int merge_ratio = 2;
|
||||
const int pw = image_size_width / patch_size;
|
||||
const int ph = image_size_height / patch_size;
|
||||
std::vector<int> positions(num_positions * 4);
|
||||
int ptr = 0;
|
||||
for (int y = 0; y < ph; y += merge_ratio) {
|
||||
for (int x = 0; x < pw; x += merge_ratio) {
|
||||
for (int dy = 0; dy < 2; dy++) {
|
||||
for (int dx = 0; dx < 2; dx++) {
|
||||
positions[ ptr] = y + dy;
|
||||
positions[ num_patches + ptr] = x + dx;
|
||||
positions[2 * num_patches + ptr] = y + dy;
|
||||
positions[3 * num_patches + ptr] = x + dx;
|
||||
ptr++;
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
|
||||
int row_offset = mask_row * (ipw * iph);
|
||||
std::fill(
|
||||
mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
|
||||
mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
|
||||
0.0);
|
||||
mask_row++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx));
|
||||
ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx));
|
||||
ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask));
|
||||
} else {
|
||||
std::iota(idx.begin(), idx.end(), 0);
|
||||
std::iota(inv_idx.begin(), inv_idx.end(), 0);
|
||||
}
|
||||
|
||||
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
||||
const int mpow = merge_ratio * merge_ratio;
|
||||
std::vector<int> positions_data(ggml_nelements(positions));
|
||||
int * data = positions_data.data();
|
||||
|
||||
int ptr = 0;
|
||||
for (int y = 0; y < iph; y += merge_ratio)
|
||||
set_input_i32("positions", positions);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
{
|
||||
for (int x = 0; x < ipw; x += merge_ratio)
|
||||
{
|
||||
for (int dy = 0; dy < 2; dy++) {
|
||||
for (int dx = 0; dx < 2; dx++) {
|
||||
auto remap = idx[ptr / mpow];
|
||||
remap = remap * mpow + (ptr % mpow);
|
||||
// pw * ph = number of tokens output by ViT after apply patch merger
|
||||
// ipw * ipw = number of vision token been processed inside ViT
|
||||
const int merge_ratio = 2;
|
||||
const int pw = image_size_width / patch_size / merge_ratio;
|
||||
const int ph = image_size_height / patch_size / merge_ratio;
|
||||
const int ipw = image_size_width / patch_size;
|
||||
const int iph = image_size_height / patch_size;
|
||||
|
||||
data[ remap] = y + dy;
|
||||
data[ num_patches + remap] = x + dx;
|
||||
data[2 * num_patches + remap] = y + dy;
|
||||
data[3 * num_patches + remap] = x + dx;
|
||||
ptr++;
|
||||
std::vector<int> idx (ph * pw);
|
||||
std::vector<int> inv_idx(ph * pw);
|
||||
|
||||
if (use_window_attn) {
|
||||
const int attn_window_size = 112;
|
||||
const int grid_window = attn_window_size / patch_size / merge_ratio;
|
||||
int dst = 0;
|
||||
// [num_vision_tokens, num_vision_tokens] attention mask tensor
|
||||
std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
|
||||
int mask_row = 0;
|
||||
|
||||
for (int y = 0; y < ph; y += grid_window) {
|
||||
for (int x = 0; x < pw; x += grid_window) {
|
||||
const int win_h = std::min(grid_window, ph - y);
|
||||
const int win_w = std::min(grid_window, pw - x);
|
||||
const int dst_0 = dst;
|
||||
// group all tokens belong to the same window togather (to a continue range)
|
||||
for (int dy = 0; dy < win_h; dy++) {
|
||||
for (int dx = 0; dx < win_w; dx++) {
|
||||
const int src = (y + dy) * pw + (x + dx);
|
||||
GGML_ASSERT(src < (int)idx.size());
|
||||
GGML_ASSERT(dst < (int)inv_idx.size());
|
||||
idx [src] = dst;
|
||||
inv_idx[dst] = src;
|
||||
dst++;
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
|
||||
int row_offset = mask_row * (ipw * iph);
|
||||
std::fill(
|
||||
mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
|
||||
mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
|
||||
0.0);
|
||||
mask_row++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
set_input_i32("window_idx", idx);
|
||||
set_input_i32("inv_window_idx", inv_idx);
|
||||
set_input_f32("window_mask", mask);
|
||||
} else {
|
||||
for (int i = 0; i < ph * pw; i++) {
|
||||
idx[i] = i;
|
||||
}
|
||||
}
|
||||
|
||||
const int mpow = merge_ratio * merge_ratio;
|
||||
std::vector<int> positions(num_positions * 4);
|
||||
|
||||
int ptr = 0;
|
||||
for (int y = 0; y < iph; y += merge_ratio) {
|
||||
for (int x = 0; x < ipw; x += merge_ratio) {
|
||||
for (int dy = 0; dy < 2; dy++) {
|
||||
for (int dx = 0; dx < 2; dx++) {
|
||||
auto remap = idx[ptr / mpow];
|
||||
remap = (remap * mpow) + (ptr % mpow);
|
||||
|
||||
positions[ remap] = y + dy;
|
||||
positions[ num_patches + remap] = x + dx;
|
||||
positions[2 * num_patches + remap] = y + dy;
|
||||
positions[3 * num_patches + remap] = x + dx;
|
||||
ptr++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions));
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
// do nothing
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
||||
// do nothing
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
|
||||
// set the 2D positions
|
||||
int n_patches_per_col = image_size_width / patch_size;
|
||||
std::vector<int> pos_data(num_positions);
|
||||
struct ggml_tensor * pos;
|
||||
// dimension H
|
||||
pos = ggml_graph_get_tensor(gf, "pos_h");
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
pos_data[i] = i / n_patches_per_col;
|
||||
}
|
||||
ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos));
|
||||
// dimension W
|
||||
pos = ggml_graph_get_tensor(gf, "pos_w");
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
pos_data[i] = i % n_patches_per_col;
|
||||
}
|
||||
ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos));
|
||||
}
|
||||
else {
|
||||
set_input_i32("positions", positions);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
{
|
||||
// set the 2D positions
|
||||
int n_patches_per_col = image_size_width / patch_size;
|
||||
std::vector<int> pos_data(num_positions);
|
||||
// dimension H
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
pos_data[i] = i / n_patches_per_col;
|
||||
}
|
||||
set_input_i32("pos_h", pos_data);
|
||||
// dimension W
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
pos_data[i] = i % n_patches_per_col;
|
||||
}
|
||||
set_input_i32("pos_w", pos_data);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM_EDGE:
|
||||
{
|
||||
// llava and other models
|
||||
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
||||
|
||||
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
||||
std::vector<int32_t> positions(num_positions);
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
positions_data[i] = i;
|
||||
positions[i] = i;
|
||||
}
|
||||
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||
free(positions_data);
|
||||
set_input_i32("positions", positions);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MLP:
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
case PROJECTOR_TYPE_LDP:
|
||||
case PROJECTOR_TYPE_LDPV2:
|
||||
{
|
||||
// llava and other models
|
||||
std::vector<int32_t> positions(num_positions);
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
positions[i] = i;
|
||||
}
|
||||
set_input_i32("positions", positions);
|
||||
|
||||
if (ctx->proj_type != PROJECTOR_TYPE_GLM_EDGE) {
|
||||
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
|
||||
// The patches vector is used to get rows to index into the embeds with;
|
||||
// we should skip dim 0 only if we have CLS to avoid going out of bounds
|
||||
// when retrieving the rows.
|
||||
int patch_offset = model.class_embedding ? 1 : 0;
|
||||
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||
std::vector<int32_t> patches(num_patches);
|
||||
for (int i = 0; i < num_patches; i++) {
|
||||
patches_data[i] = i + patch_offset;
|
||||
patches[i] = i + patch_offset;
|
||||
}
|
||||
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
||||
free(patches_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (use_window_attn && (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL)) {
|
||||
struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx");
|
||||
struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx");
|
||||
struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask");
|
||||
|
||||
const int merge_ratio = 2;
|
||||
const int attn_window_size = 112;
|
||||
const int pw = image_size_width / patch_size / merge_ratio;
|
||||
const int ph = image_size_height / patch_size / merge_ratio;
|
||||
const int grid_window = attn_window_size / patch_size / merge_ratio;
|
||||
const int ipw = image_size_width / patch_size;
|
||||
const int iph = image_size_height / patch_size;
|
||||
/*
|
||||
pw * ph = number of tokens output by ViT after apply patch merger
|
||||
ipw * ipw = number of vision token been processed inside ViT
|
||||
*/
|
||||
|
||||
std::vector<int> idx(ph * pw);
|
||||
std::vector<int> inv_idx(ph * pw);
|
||||
int dst = 0;
|
||||
// [num_vision_tokens, num_vision_tokens] attention mask tensor
|
||||
std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
|
||||
int mask_row = 0;
|
||||
|
||||
for (int y = 0; y < ph; y+=grid_window)
|
||||
{
|
||||
for (int x = 0; x < pw; x+=grid_window)
|
||||
set_input_i32("patches", patches);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
{
|
||||
const int win_h = std::min(grid_window, ph - y);
|
||||
const int win_w = std::min(grid_window, pw - x);
|
||||
const int dst_0 = dst;
|
||||
// group all tokens belong to the same window togather (to a continue range)
|
||||
for (int dy = 0; dy < win_h; dy++) {
|
||||
for (int dx = 0; dx < win_w; dx++) {
|
||||
const int src = (y + dy) * pw + (x + dx);
|
||||
assert(src < (int)idx.size());
|
||||
assert(dst < (int)inv_idx.size());
|
||||
idx[src] = dst;
|
||||
inv_idx[dst] = src;
|
||||
dst++;
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
|
||||
int row_offset = mask_row * (ipw * iph);
|
||||
std::fill(
|
||||
mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
|
||||
mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
|
||||
0.0);
|
||||
mask_row++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx));
|
||||
ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx));
|
||||
ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask));
|
||||
// do nothing
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("Unknown projector type");
|
||||
}
|
||||
|
||||
ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
|
||||
@@ -3537,7 +3522,7 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
|
||||
}
|
||||
|
||||
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL;
|
||||
return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
|
||||
}
|
||||
|
||||
bool clip_is_llava(const struct clip_ctx * ctx) {
|
||||
|
||||
+15
-4
@@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
|
||||
CLIP_API void clip_free(struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
||||
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
|
||||
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
|
||||
|
||||
CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
|
||||
CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
|
||||
@@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
|
||||
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
|
||||
CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
|
||||
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
||||
CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
|
||||
GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
|
||||
"use clip_n_output_tokens instead");
|
||||
GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
|
||||
"use clip_n_output_tokens instead");
|
||||
|
||||
CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
||||
|
||||
// for M-RoPE, this will be the number of token positions in X and Y directions
|
||||
// for other models, X will be the total number of tokens and Y will be 1
|
||||
CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
||||
CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
||||
|
||||
// this should be equal to the embedding dimension of the text model
|
||||
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
|
||||
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
|
||||
|
||||
@@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
|
||||
}
|
||||
|
||||
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
|
||||
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
|
||||
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
|
||||
struct {
|
||||
struct ggml_context * ctx;
|
||||
} model;
|
||||
@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
|
||||
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
|
||||
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
|
||||
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
|
||||
// fill it with the image embeddings, ignoring the base
|
||||
for (size_t i = 1; i < num_images; i++) {
|
||||
@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
|
||||
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
|
||||
// append without newline tokens (default behavior in llava_arch when not using unpad ):
|
||||
memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
|
||||
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
|
||||
memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
|
||||
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
|
||||
|
||||
// Debug: Test single segments
|
||||
// Current findings: sending base image, sending a segment embedding all works similar to python
|
||||
@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
|
||||
image_embd_v[i],
|
||||
clip_embd_nbytes_by_img(ctx_clip, nx, ny));
|
||||
n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
|
||||
n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
|
||||
}
|
||||
*n_img_pos = n_img_pos_out;
|
||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||
@@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
}
|
||||
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
||||
// flat / default llava-1.5 type embedding
|
||||
*n_img_pos = clip_n_patches(ctx_clip);
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
|
||||
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
|
||||
if (!encoded) {
|
||||
LOG_ERR("Unable to encode image\n");
|
||||
@@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
|
||||
|
||||
int n_img_pos_out;
|
||||
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
|
||||
clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
|
||||
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
|
||||
*n_img_pos = n_img_pos_out;
|
||||
|
||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||
|
||||
@@ -136,39 +136,6 @@ struct mtmd_cli_context {
|
||||
}
|
||||
};
|
||||
|
||||
struct decode_embd_batch {
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id> seq_id_0;
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
logits .resize(n_tokens);
|
||||
seq_id_0.resize(1);
|
||||
seq_id_0[0] = seq_id;
|
||||
seq_ids [n_tokens] = nullptr;
|
||||
batch = {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
/*logits =*/ logits.data(),
|
||||
};
|
||||
for (int i = 0; i < n_tokens; i++) {
|
||||
batch.pos [i] = pos_0 + i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
|
||||
llama_tokens generated_tokens;
|
||||
for (int i = 0; i < n_predict; i++) {
|
||||
@@ -243,7 +210,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx.n_past += mtmd_helper_get_n_tokens(chunks);
|
||||
ctx.n_past += mtmd_helper_get_n_pos(chunks);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -371,6 +338,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
if (g_is_interrupted) LOG("\nInterrupted by user\n");
|
||||
LOG("\n\n");
|
||||
llama_perf_context_print(ctx.lctx);
|
||||
return g_is_interrupted ? 130 : 0;
|
||||
}
|
||||
|
||||
+128
-28
@@ -40,11 +40,14 @@ struct mtmd_context {
|
||||
llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice
|
||||
llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
|
||||
|
||||
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
|
||||
|
||||
// TODO @ngxson : add timings
|
||||
|
||||
mtmd_context(const char * mmproj_fname,
|
||||
const llama_model * text_model,
|
||||
const mtmd_context_params & ctx_params) :
|
||||
text_model (text_model),
|
||||
print_timings(ctx_params.print_timings),
|
||||
n_threads (ctx_params.n_threads),
|
||||
image_marker (ctx_params.image_marker)
|
||||
@@ -56,9 +59,8 @@ struct mtmd_context {
|
||||
if (!ctx_clip) {
|
||||
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
|
||||
}
|
||||
this->text_model = text_model;
|
||||
|
||||
GGML_ASSERT(!clip_is_qwen2vl(ctx_clip) && "Qwen2VL model is not supported yet, use llama-qwen2vl-cli instead");
|
||||
use_mrope = clip_is_qwen2vl(ctx_clip);
|
||||
|
||||
int minicpmv_version = clip_is_minicpmv(ctx_clip);
|
||||
if (minicpmv_version == 2) {
|
||||
@@ -126,6 +128,7 @@ struct mtmd_image_tokens_data {
|
||||
struct mtmd_image_tokens {
|
||||
uint32_t nx; // number of tokens in x direction
|
||||
uint32_t ny; // number of tokens in y direction
|
||||
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
|
||||
uint32_t n_tokens() const { return nx * ny; }
|
||||
clip_image_f32_batch batch_f32; // preprocessed image patches
|
||||
std::string id; // optional user-defined ID, useful for KV cache tracking
|
||||
@@ -202,10 +205,14 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
||||
}
|
||||
|
||||
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
|
||||
// for glm-edge, we don't need to add because the tokens are already in the returned embeddings
|
||||
else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||
marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
|
||||
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
|
||||
|
||||
// TODO @ngxson : glm-edge : remove BOI / EOI tokens embeddings, decode them as normal tokens
|
||||
}
|
||||
|
||||
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
|
||||
|
||||
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
|
||||
output.clear();
|
||||
@@ -229,7 +236,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
|
||||
for (auto & entry : batch_f32.entries) {
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
image_tokens->nx = clip_n_patches_by_img(ctx->ctx_clip, entry.get());
|
||||
image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
|
||||
image_tokens->ny = 1;
|
||||
image_tokens->batch_f32.entries.push_back(std::move(entry));
|
||||
image_tokens->id = id;
|
||||
@@ -246,7 +253,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
};
|
||||
|
||||
for (const auto & part : parts) {
|
||||
//printf("tokenizing part: %s\n", part.c_str());
|
||||
// printf("tokenizing part: %s\n", part.c_str());
|
||||
bool add_bos = &parts.front() == ∂
|
||||
auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
|
||||
if (tokens.empty()) {
|
||||
@@ -325,12 +332,20 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
} else {
|
||||
size_t n_tokens = 0;
|
||||
for (const auto & entry : batch_f32.entries) {
|
||||
n_tokens += clip_n_patches_by_img(ctx->ctx_clip, entry.get());
|
||||
n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
|
||||
}
|
||||
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
image_tokens->nx = n_tokens;
|
||||
image_tokens->ny = 1; // TODO
|
||||
if (ctx->use_mrope) {
|
||||
// for Qwen2VL, we need this information for M-RoPE decoding positions
|
||||
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
|
||||
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
|
||||
image_tokens->use_mrope_pos = true;
|
||||
} else {
|
||||
// other models, we only need the total number of tokens
|
||||
image_tokens->nx = n_tokens;
|
||||
image_tokens->ny = 1;
|
||||
}
|
||||
image_tokens->batch_f32 = std::move(batch_f32);
|
||||
image_tokens->id = bitmaps[i_img].id; // optional
|
||||
|
||||
@@ -338,11 +353,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
|
||||
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
|
||||
|
||||
if (clip_is_glm(ctx->ctx_clip)) {
|
||||
// glm-edge
|
||||
image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings
|
||||
}
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
{},
|
||||
@@ -380,6 +390,13 @@ std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
|
||||
return image_tokens->id;
|
||||
}
|
||||
|
||||
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
|
||||
if (image_tokens->use_mrope_pos) {
|
||||
return 1; // for M-RoPE, the whole image is 1 in temporal dimension
|
||||
}
|
||||
return image_tokens->n_tokens();
|
||||
}
|
||||
|
||||
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
||||
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
|
||||
@@ -397,7 +414,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
|
||||
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
||||
const auto & entries = image_tokens->batch_f32.entries;
|
||||
for (size_t i = 0; i < entries.size(); i++) {
|
||||
int n_tokens_per_image = clip_n_patches_by_img(ctx->ctx_clip, entries[i].get());
|
||||
int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
|
||||
ok = clip_image_encode(
|
||||
ctx->ctx_clip,
|
||||
ctx->n_threads,
|
||||
@@ -425,7 +442,7 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
|
||||
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
n_tokens += chunk.tokens_text.size();
|
||||
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
n_tokens += chunk.tokens_image->n_tokens();
|
||||
n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
|
||||
} else {
|
||||
GGML_ASSERT(false && "chunk type not supported");
|
||||
}
|
||||
@@ -433,22 +450,38 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
|
||||
return n_tokens;
|
||||
}
|
||||
|
||||
llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks) {
|
||||
llama_pos n_pos = 0;
|
||||
for (auto & chunk : chunks) {
|
||||
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
n_pos += chunk.tokens_text.size();
|
||||
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
n_pos += mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
|
||||
} else {
|
||||
GGML_ASSERT(false && "chunk type not supported");
|
||||
}
|
||||
}
|
||||
return n_pos;
|
||||
}
|
||||
|
||||
// helper struct to make working with embd batch easier
|
||||
// note: this will be removed after llama_batch_ext refactoring
|
||||
struct decode_embd_batch {
|
||||
int n_pos_per_embd;
|
||||
int n_mmproj_embd;
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<llama_pos> pos_view; // used by mrope
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id> seq_id_0;
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
||||
pos .resize(n_tokens * n_pos_per_embd);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
logits .resize(n_tokens);
|
||||
seq_id_0.resize(1);
|
||||
seq_id_0[0] = seq_id;
|
||||
seq_ids [n_tokens] = nullptr;
|
||||
batch = {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
@@ -459,13 +492,64 @@ struct decode_embd_batch {
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
/*logits =*/ logits.data(),
|
||||
};
|
||||
for (int i = 0; i < n_tokens; i++) {
|
||||
}
|
||||
|
||||
void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.pos [i] = pos_0 + i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int y = 0; y < ny; y++) {
|
||||
for (int x = 0; x < nx; x++) {
|
||||
int i = y * nx + x;
|
||||
pos[i ] = pos_0;
|
||||
pos[i + batch.n_tokens ] = pos_0 + y;
|
||||
pos[i + batch.n_tokens * 2] = pos_0 + x;
|
||||
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch get_view(int offset, int n_tokens) {
|
||||
llama_pos * pos_ptr;
|
||||
pos_view.clear();
|
||||
pos_view.resize(n_tokens * n_pos_per_embd);
|
||||
if (n_pos_per_embd > 1) {
|
||||
// mrope
|
||||
// for example, with layout of src: 1234...1234...1234...1234...
|
||||
// offset 2 will give us dst: 34...34...34...34...
|
||||
for (int i = 0; i < n_pos_per_embd; i++) {
|
||||
auto src = pos.begin() + i * batch.n_tokens + offset;
|
||||
pos_view.insert(pos_view.end(), src, src + n_tokens);
|
||||
}
|
||||
pos_ptr = pos_view.data();
|
||||
} else {
|
||||
// normal
|
||||
pos_ptr = pos.data() + offset;
|
||||
}
|
||||
return {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ batch.embd + offset * n_mmproj_embd,
|
||||
/*pos =*/ pos_ptr,
|
||||
/*n_seq_id =*/ batch.n_seq_id + offset,
|
||||
/*seq_id =*/ batch.seq_id + offset,
|
||||
/*logits =*/ batch.logits + offset,
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
@@ -478,6 +562,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
llama_pos n_past = pos0;
|
||||
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
||||
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
|
||||
|
||||
for (auto & chunk : chunks) {
|
||||
bool is_last = &chunk == &chunks.back();
|
||||
@@ -525,6 +610,16 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
int32_t i_batch = 0;
|
||||
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
|
||||
float * embd = mtmd_get_output_embd(ctx);
|
||||
decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
|
||||
|
||||
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
|
||||
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
|
||||
|
||||
if (mtmd_decode_use_mrope(ctx)) {
|
||||
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
|
||||
} else {
|
||||
batch_embd.set_position_normal(n_past, seq_id);
|
||||
}
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, false);
|
||||
@@ -532,15 +627,14 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
}
|
||||
|
||||
while (i_batch < n_img_batches) { // split into batches
|
||||
int32_t pos_offset = i_batch*n_batch;
|
||||
int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
|
||||
float * embd_batch = embd + pos_offset*n_mmproj_embd;
|
||||
decode_embd_batch batch_img(embd_batch, n_tokens_batch, n_past, 0);
|
||||
int pos_offset = i_batch*n_batch;
|
||||
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
|
||||
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
|
||||
|
||||
printf("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
|
||||
LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
|
||||
|
||||
int64_t t1 = ggml_time_ms();
|
||||
ret = llama_decode(lctx, batch_img.batch);
|
||||
ret = llama_decode(lctx, batch_embd_view);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode image\n");
|
||||
llama_set_causal_attn(lctx, true); // restore causal attn
|
||||
@@ -553,9 +647,11 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
|
||||
}
|
||||
|
||||
i_batch++;
|
||||
n_past += n_tokens_batch;
|
||||
}
|
||||
|
||||
// for mrope, one image is one single **temporal** position
|
||||
n_past += mtmd_decode_use_mrope(ctx) ? 1 : n_tokens;
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, true);
|
||||
}
|
||||
@@ -603,6 +699,10 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
||||
return ctx->use_mrope;
|
||||
}
|
||||
|
||||
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
|
||||
mtmd_image_tokens_free(val);
|
||||
}
|
||||
|
||||
@@ -102,6 +102,7 @@ MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * im
|
||||
MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
|
||||
MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
|
||||
|
||||
// returns 0 on success
|
||||
@@ -114,15 +115,21 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
|
||||
// whether we need to set non-causal mask before llama_decode
|
||||
MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
|
||||
|
||||
// whether the current model use M-RoPE for llama_decode
|
||||
MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
|
||||
|
||||
|
||||
|
||||
//
|
||||
// helper functions (can be implemented based on other functions)
|
||||
//
|
||||
|
||||
// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
|
||||
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
|
||||
MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
|
||||
|
||||
// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
|
||||
MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks);
|
||||
|
||||
// helper function that automatically:
|
||||
// 1. run llama_decode() on text chunks
|
||||
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
||||
|
||||
@@ -27,6 +27,8 @@
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
||||
// THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL
|
||||
// IT IS NOT A PRODUCTION CODE
|
||||
|
||||
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
|
||||
int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
|
||||
@@ -92,20 +94,12 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
|
||||
|
||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
|
||||
int N = (int) tokens.size();
|
||||
std::vector<llama_pos> pos;
|
||||
for (int i = 0; i < N; i += n_batch) {
|
||||
int n_eval = (int) tokens.size() - i;
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
auto batch = llama_batch_get_one(&tokens[i], n_eval);
|
||||
// TODO: add mrope pos ids somewhere else
|
||||
pos.resize(batch.n_tokens * 4);
|
||||
std::fill(pos.begin(), pos.end(), 0);
|
||||
for (int j = 0; j < batch.n_tokens * 3; j ++) {
|
||||
pos[j] = *st_pos_id + (j % batch.n_tokens);
|
||||
}
|
||||
batch.pos = pos.data();
|
||||
|
||||
if (llama_decode(ctx_llama, batch)) {
|
||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
@@ -54,8 +54,8 @@ add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
||||
add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-qwen2vl-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
|
||||
# to test the big models, run: ./tests.sh big
|
||||
add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"
|
||||
|
||||
@@ -304,8 +304,9 @@ int main(int argc, char * argv[]) {
|
||||
get_backend_memory(&free_mem, &total_mem);
|
||||
}
|
||||
const char * cache_dir = nullptr;
|
||||
std::string cache_dir_str = fs_get_cache_directory() + "rpc/";
|
||||
std::string cache_dir_str;
|
||||
if (params.use_cache) {
|
||||
cache_dir_str = fs_get_cache_directory() + "rpc/";
|
||||
if (!fs_create_directory_with_parents(cache_dir_str)) {
|
||||
fprintf(stderr, "Failed to create cache directory: %s\n", cache_dir_str.c_str());
|
||||
return 1;
|
||||
|
||||
@@ -642,9 +642,31 @@ static json oaicompat_completion_params_parse(
|
||||
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
|
||||
}
|
||||
|
||||
// if the assistant message appears at the end of list, we do not add end-of-turn token
|
||||
// for ex. this can be useful to modify the reasoning process in reasoning models
|
||||
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
|
||||
common_chat_msg last_message;
|
||||
if (prefill_assistant_message) {
|
||||
last_message = inputs.messages.back();
|
||||
inputs.messages.pop_back();
|
||||
|
||||
/* sanity check, max one assistant message at the end of the list */
|
||||
if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
|
||||
throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
|
||||
}
|
||||
|
||||
inputs.extract_reasoning = false;
|
||||
inputs.add_generation_prompt = true;
|
||||
}
|
||||
|
||||
// Apply chat template to the list of messages
|
||||
auto chat_params = common_chat_templates_apply(tmpls, inputs);
|
||||
|
||||
/* Append assistant prefilled message */
|
||||
if (prefill_assistant_message) {
|
||||
chat_params.prompt += last_message.content;
|
||||
}
|
||||
|
||||
llama_params["chat_format"] = static_cast<int>(chat_params.format);
|
||||
llama_params["prompt"] = chat_params.prompt;
|
||||
if (!chat_params.grammar.empty()) {
|
||||
|
||||
@@ -352,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
# TODO: Separation to determine activation of VX/VXE/VXE2
|
||||
if (${S390X_M} MATCHES "8561|8562")
|
||||
message(STATUS "z15 target")
|
||||
list(APPEND ARCH_FLAGS -march=z15 -mtune=z15)
|
||||
list(APPEND ARCH_FLAGS -march=z15)
|
||||
elseif (${S390X_M} MATCHES "3931")
|
||||
message(STATUS "z16 target")
|
||||
list(APPEND ARCH_FLAGS -march=z16 -mtune=z16)
|
||||
list(APPEND ARCH_FLAGS -march=z16)
|
||||
elseif (${S390X_M} MATCHES "9175|9176")
|
||||
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
|
||||
message(STATUS "z17 target")
|
||||
list(APPEND ARCH_FLAGS -march=z17)
|
||||
else()
|
||||
message(STATUS "Unknown target")
|
||||
message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
|
||||
|
||||
@@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
||||
#define GGML_F32_EPR 4
|
||||
|
||||
#define GGML_F32x4 vector float
|
||||
#define GGML_F32x4_ZERO 0.0f
|
||||
#define GGML_F32x4_ZERO {0.0f}
|
||||
#define GGML_F32x4_SET1 vec_splats
|
||||
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
|
||||
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
#include "convert.cuh"
|
||||
#include "dequantize.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#define CUDA_Q8_0_NE_ALIGN 2048
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
@@ -570,30 +572,46 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
|
||||
const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
static __global__ void convert_unary(
|
||||
const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03) {
|
||||
const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
if (i00 >= ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i01 = blockIdx.y;
|
||||
const int64_t i02 = blockIdx.z % ne02;
|
||||
const int64_t i03 = blockIdx.z / ne02;
|
||||
|
||||
const src_t * x = (const src_t *) vx;
|
||||
|
||||
y[i] = float(x[i]);
|
||||
const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
|
||||
const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
|
||||
y[iy] = float(x[ix]);
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
static void convert_unary_cuda(const void * vx, dst_t * y,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
|
||||
const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
|
||||
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
|
||||
(vx, y, ne00, ne01, ne02, s01, s02, s03);
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
convert_unary_cuda<src_t>(vx, y, k, 1, 1, 1, k, k, k, stream);
|
||||
}
|
||||
|
||||
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
return convert_unary_cont_cuda<float>;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half>;
|
||||
return convert_unary_cont_cuda<half>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
@@ -643,9 +661,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return dequantize_row_iq3_s_cuda;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
return convert_unary_cont_cuda<float>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cuda<nv_bfloat16>;
|
||||
return convert_unary_cont_cuda<nv_bfloat16>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
@@ -692,7 +710,18 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return dequantize_row_iq3_s_cuda;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half>;
|
||||
return convert_unary_cont_cuda<half>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cont_cuda<nv_bfloat16>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cuda<nv_bfloat16>;
|
||||
default:
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
||||
|
||||
template<typename T>
|
||||
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
|
||||
using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);
|
||||
|
||||
typedef to_t_cuda_t<float> to_fp32_cuda_t;
|
||||
typedef to_t_cuda_t<half> to_fp16_cuda_t;
|
||||
@@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
|
||||
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);
|
||||
|
||||
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
|
||||
|
||||
// TODO more general support for non-contiguous inputs
|
||||
|
||||
template<typename T>
|
||||
using to_t_nc_cuda_t = void (*)(const void * x, T * y,
|
||||
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
|
||||
int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
|
||||
|
||||
typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
|
||||
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
|
||||
|
||||
@@ -1720,15 +1720,15 @@ static __global__ void k_compute_batched_ptrs(
|
||||
size_t nb12, size_t nb13,
|
||||
size_t nbd2, size_t nbd3,
|
||||
int64_t r2, int64_t r3) {
|
||||
int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (i13 >= ne13 || i12 >= ne12) {
|
||||
return;
|
||||
}
|
||||
|
||||
int64_t i03 = i13 / r3;
|
||||
int64_t i02 = i12 / r2;
|
||||
const int64_t i03 = i13 / r3;
|
||||
const int64_t i02 = i12 / r2;
|
||||
|
||||
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
||||
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
|
||||
@@ -1742,6 +1742,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
|
||||
// Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
|
||||
// As long as dst is contiguous this does not matter though.
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const int64_t ne_dst = ggml_nelements(dst);
|
||||
@@ -1750,21 +1754,31 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
|
||||
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
|
||||
|
||||
void * src0_ddq = src0->data;
|
||||
half * src0_f16 = (half *) src0_ddq;
|
||||
float * src1_ddf = (float *) src1->data;
|
||||
float * dst_ddf = (float *) dst->data;
|
||||
const half * src0_f16 = (const half *) src0->data;
|
||||
float * dst_ddf = (float *) dst->data;
|
||||
|
||||
const half * src1_f16 = (const half *) src1->data;
|
||||
const size_t ts_src1 = ggml_type_size(src1->type);
|
||||
GGML_ASSERT(nb10 == ts_src1);
|
||||
int64_t s11 = nb11 / ts_src1;
|
||||
int64_t s12 = nb12 / ts_src1;
|
||||
int64_t s13 = nb13 / ts_src1;
|
||||
ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
|
||||
|
||||
// convert src1 to fp16
|
||||
ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
|
||||
if (src1->type != GGML_TYPE_F16) {
|
||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
||||
const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type);
|
||||
const int64_t ne_src1 = ggml_nelements(src1);
|
||||
src1_f16_alloc.alloc(ne_src1);
|
||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||
to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
|
||||
|
||||
to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
|
||||
|
||||
src1_f16 = src1_f16_alloc.get();
|
||||
s11 = ne10;
|
||||
s12 = ne11*s11;
|
||||
s13 = ne12*s12;
|
||||
}
|
||||
half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
|
||||
|
||||
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
|
||||
char * dst_t;
|
||||
@@ -1824,13 +1838,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
int i02 = i12 / r2;
|
||||
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
|
||||
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
|
||||
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
|
||||
cu_compute_type,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half),
|
||||
src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11,
|
||||
beta, ( char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0,
|
||||
cu_compute_type,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1841,15 +1855,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
|
||||
(const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
|
||||
beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
|
||||
alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
|
||||
src1_f16, CUDA_R_16F, s11, s12, // strideB
|
||||
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
|
||||
ne12*ne13,
|
||||
cu_compute_type,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
} else {
|
||||
// use cublasGemmBatchedEx
|
||||
const int ne23 = ne12*ne13;
|
||||
const int64_t ne23 = ne12*ne13;
|
||||
|
||||
ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
|
||||
ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
|
||||
@@ -1861,8 +1875,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
ne12, ne13,
|
||||
ne23,
|
||||
nb02, nb03,
|
||||
src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
|
||||
src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
|
||||
src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half),
|
||||
src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half),
|
||||
nbd2, nbd3,
|
||||
r2, r3);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
@@ -1871,8 +1885,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
|
||||
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
|
||||
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
|
||||
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11,
|
||||
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
|
||||
ne23,
|
||||
cu_compute_type,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
@@ -1936,7 +1950,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
} else if (!split && use_mul_mat_vec_q) {
|
||||
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
|
||||
dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||
!ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||
// general KQ + KQV multi-batch without FlashAttention
|
||||
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
||||
} else if (use_mul_mat_vec) {
|
||||
|
||||
@@ -982,8 +982,21 @@ bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
|
||||
}
|
||||
|
||||
ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
|
||||
// Validate tensor type before using it
|
||||
if (tensor->type >= GGML_TYPE_COUNT) {
|
||||
GGML_LOG_ERROR("[%s] invalid tensor type received: %u\n", __func__, tensor->type);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
|
||||
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
|
||||
|
||||
// ggml_new_tensor_4d might fail if dimensions are invalid, although less likely to crash than invalid type
|
||||
if (result == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\\n", __func__, tensor->type);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
result->nb[i] = tensor->nb[i];
|
||||
}
|
||||
@@ -1043,7 +1056,9 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
|
||||
|
||||
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
|
||||
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
|
||||
GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%zu) out of buffer bounds [0x%zx, 0x%zx)\n",
|
||||
__func__, in_tensor->data, offset, size, p0, p1);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1118,7 +1133,9 @@ bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set
|
||||
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
|
||||
|
||||
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
|
||||
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
|
||||
GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%zu, hash=0x%" PRIx64 ") out of buffer bounds [0x%zx, 0x%zx)\n",
|
||||
__func__, in_tensor->data, offset, size, *hash, p0, p1);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
ggml_backend_tensor_set(tensor, cached_file.data(), offset, size);
|
||||
@@ -1183,7 +1200,9 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
|
||||
if (request.tensor.data + request.offset < p0 ||
|
||||
request.tensor.data + request.offset >= p1 ||
|
||||
request.size > (p1 - request.tensor.data - request.offset)) {
|
||||
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
|
||||
GGML_LOG_ERROR("[%s] requested tensor region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%" PRIu64 ") out of buffer bounds [0x%zx, 0x%zx)\n",
|
||||
__func__, request.tensor.data, request.offset, request.size, p0, p1);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1237,22 +1256,50 @@ ggml_tensor * rpc_server::create_node(uint64_t id,
|
||||
struct ggml_context * ctx,
|
||||
const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
|
||||
std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map) {
|
||||
if (id == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
if (tensor_map.find(id) != tensor_map.end()) {
|
||||
return tensor_map[id];
|
||||
}
|
||||
const rpc_tensor * tensor = tensor_ptrs.at(id);
|
||||
// Safely find the tensor pointer
|
||||
auto it_ptr = tensor_ptrs.find(id);
|
||||
if (it_ptr == tensor_ptrs.end()) {
|
||||
return nullptr;
|
||||
}
|
||||
const rpc_tensor * tensor = it_ptr->second;
|
||||
|
||||
struct ggml_tensor * result = deserialize_tensor(ctx, tensor);
|
||||
if (result == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
tensor_map[id] = result;
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
|
||||
// Check if the source ID is 0 before calling create_node recursively
|
||||
if (tensor->src[i] == 0) {
|
||||
result->src[i] = nullptr;
|
||||
} else {
|
||||
result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
|
||||
// If the recursive call failed for a non-zero ID, propagate the error
|
||||
if (result->src[i] == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] failed to create source node %d (src_id=%" PRIu64 ") for node id %" PRIu64 "\n",
|
||||
__func__, i, tensor->src[i], id);
|
||||
// Must return nullptr to signal failure up the call stack
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Handle view_src similarly
|
||||
if (tensor->view_src == 0) {
|
||||
result->view_src = nullptr;
|
||||
} else {
|
||||
result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map);
|
||||
// If the recursive call failed for a non-zero ID, propagate the error
|
||||
if (result->view_src == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] failed to create view_src node (view_src_id=%" PRIu64 ") for node id %" PRIu64 "\n",
|
||||
__func__, tensor->view_src, id);
|
||||
// Must return nullptr to signal failure up the call stack
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map);
|
||||
result->view_offs = tensor->view_offs;
|
||||
return result;
|
||||
}
|
||||
@@ -1278,6 +1325,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph
|
||||
GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
|
||||
|
||||
size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_size,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
@@ -1297,6 +1345,14 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph
|
||||
int64_t id;
|
||||
memcpy(&id, &nodes[i], sizeof(id));
|
||||
graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
|
||||
|
||||
// Check if create_node failed for a *non-zero* ID.
|
||||
// If id was 0, create_node returning nullptr is expected.
|
||||
// If id was non-zero and create_node returned nullptr, it indicates a deserialization error.
|
||||
if (graph->nodes[i] == nullptr && id != 0) {
|
||||
GGML_LOG_ERROR("[%s] failed to create graph node %d (id=%" PRId64 ")\n", __func__, i, id);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
ggml_status status = ggml_backend_graph_compute(backend, graph);
|
||||
response.result = status;
|
||||
@@ -1361,7 +1417,9 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
||||
return;
|
||||
}
|
||||
rpc_msg_get_alloc_size_rsp response;
|
||||
server.get_alloc_size(request, response);
|
||||
if (!server.get_alloc_size(request, response)) {
|
||||
return;
|
||||
}
|
||||
if (!send_msg(sockfd, &response, sizeof(response))) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -482,7 +482,7 @@ float16_t dequantFuncIQ2_XXS(const in decodeBufIQ2_XXS bl, const in uint blockCo
|
||||
const uint ib8 = (idx & 0x18) >> 3; // 0..3
|
||||
const uint iqs = 8 * ib32 + ib8;
|
||||
|
||||
const uint8_t qs = bl.block.qs[iqs];
|
||||
const uint qs = bl.block.qs[iqs];
|
||||
const uint signscale = pack32(u16vec2(bl16.block.qs[4*ib32+2], bl16.block.qs[4*ib32+3]));
|
||||
|
||||
const float dscale = float(bl.block.d) * 0.25 * (0.5 + float(signscale >> 28));
|
||||
|
||||
@@ -104,6 +104,7 @@ class Keys:
|
||||
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
|
||||
EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
|
||||
EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
|
||||
MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers"
|
||||
POOLING_TYPE = "{arch}.pooling_type"
|
||||
LOGIT_SCALE = "{arch}.logit_scale"
|
||||
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
||||
@@ -267,6 +268,7 @@ class MODEL_ARCH(IntEnum):
|
||||
REFACT = auto()
|
||||
BERT = auto()
|
||||
NOMIC_BERT = auto()
|
||||
NOMIC_BERT_MOE = auto()
|
||||
JINA_BERT_V2 = auto()
|
||||
BLOOM = auto()
|
||||
STABLELM = auto()
|
||||
@@ -521,6 +523,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.REFACT: "refact",
|
||||
MODEL_ARCH.BERT: "bert",
|
||||
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
|
||||
MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
|
||||
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
|
||||
MODEL_ARCH.BLOOM: "bloom",
|
||||
MODEL_ARCH.STABLELM: "stablelm",
|
||||
@@ -960,6 +963,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||
],
|
||||
MODEL_ARCH.NOMIC_BERT_MOE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||
MODEL_TENSOR.TOKEN_TYPES,
|
||||
MODEL_TENSOR.POS_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_OUT_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||
],
|
||||
MODEL_ARCH.JINA_BERT_V2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||
|
||||
@@ -728,6 +728,9 @@ class GGUFWriter:
|
||||
def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
|
||||
|
||||
def add_moe_every_n_layers(self, value: int) -> None:
|
||||
self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value)
|
||||
|
||||
def add_swin_norm(self, value: bool) -> None:
|
||||
self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
|
||||
|
||||
|
||||
@@ -290,6 +290,7 @@ class TensorNameMap:
|
||||
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
|
||||
"model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
|
||||
"language_model.model.layers.{bid}.feed_forward.router", # llama4
|
||||
"encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
|
||||
@@ -322,6 +323,7 @@ class TensorNameMap:
|
||||
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w3", # internlm2
|
||||
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||
"encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe
|
||||
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
||||
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
|
||||
"model.layers.{bid}.residual_mlp.w3", # arctic
|
||||
@@ -337,6 +339,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
|
||||
"model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
|
||||
"language_model.model.layers.{bid}.feed_forward.experts.up_proj", # llama4
|
||||
"encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_SHEXP: (
|
||||
@@ -418,6 +421,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
|
||||
"model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
|
||||
"language_model.model.layers.{bid}.feed_forward.experts.down_proj", # llama4
|
||||
"encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
||||
|
||||
@@ -1232,6 +1232,7 @@ extern "C" {
|
||||
"will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
|
||||
|
||||
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
/// Setting k <= 0 makes this a noop
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
|
||||
|
||||
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
|
||||
@@ -19,9 +19,9 @@ logger = logging.getLogger("compare-llama-bench")
|
||||
|
||||
# Properties by which to differentiate results per commit:
|
||||
KEY_PROPERTIES = [
|
||||
"cpu_info", "gpu_info", "backends", "n_gpu_layers", "model_filename", "model_type", "n_batch", "n_ubatch",
|
||||
"embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", "use_mmap", "no_kv_offload",
|
||||
"split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
|
||||
"cpu_info", "gpu_info", "backends", "n_gpu_layers", "tensor_buft_overrides", "model_filename", "model_type",
|
||||
"n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v",
|
||||
"use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth"
|
||||
]
|
||||
|
||||
# Properties that are boolean and are converted to Yes/No for the table:
|
||||
@@ -30,11 +30,11 @@ BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "fla
|
||||
# Header names for the table:
|
||||
PRETTY_NAMES = {
|
||||
"cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
|
||||
"model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
|
||||
"model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
|
||||
"embeddings": "Embeddings", "cpu_mask": "CPU mask", "cpu_strict": "CPU strict", "poll": "Poll",
|
||||
"n_threads": "Threads", "type_k": "K type", "type_v": "V type", "split_mode": "Split mode", "main_gpu": "Main GPU",
|
||||
"no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split", "use_mmap": "Use mmap",
|
||||
"tensor_buft_overrides": "Tensor overrides", "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
|
||||
"model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size", "embeddings": "Embeddings",
|
||||
"cpu_mask": "CPU mask", "cpu_strict": "CPU strict", "poll": "Poll", "n_threads": "Threads", "type_k": "K type", "type_v": "V type",
|
||||
"use_mmap": "Use mmap", "no_kv_offload": "NKVO", "split_mode": "Split mode", "main_gpu": "Main GPU", "tensor_split": "Tensor split",
|
||||
"flash_attn": "FlashAttention",
|
||||
}
|
||||
|
||||
DEFAULT_SHOW = ["model_type"] # Always show these properties by default.
|
||||
@@ -281,12 +281,12 @@ def get_rows(properties):
|
||||
The returned rows are unique in terms of property combinations.
|
||||
"""
|
||||
select_string = ", ".join(
|
||||
[f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"])
|
||||
[f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "tb.n_depth", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"])
|
||||
equal_string = " AND ".join(
|
||||
[f"tb.{p} = tc.{p}" for p in KEY_PROPERTIES] + [
|
||||
f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"]
|
||||
)
|
||||
group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt"])
|
||||
group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt", "tb.n_depth"])
|
||||
query = (f"SELECT {select_string} FROM test tb JOIN test tc ON {equal_string} "
|
||||
f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
|
||||
return cursor.execute(query).fetchall()
|
||||
@@ -309,7 +309,7 @@ else:
|
||||
rows_full = get_rows(KEY_PROPERTIES)
|
||||
properties_different = []
|
||||
for i, kp_i in enumerate(KEY_PROPERTIES):
|
||||
if kp_i in DEFAULT_SHOW or kp_i == "n_prompt" or kp_i == "n_gen":
|
||||
if kp_i in DEFAULT_SHOW or kp_i in ["n_prompt", "n_gen", "n_depth"]:
|
||||
continue
|
||||
for row_full in rows_full:
|
||||
if row_full[i] != rows_full[0][i]:
|
||||
@@ -340,17 +340,20 @@ else:
|
||||
|
||||
table = []
|
||||
for row in rows_show:
|
||||
n_prompt = int(row[-4])
|
||||
n_gen = int(row[-3])
|
||||
n_prompt = int(row[-5])
|
||||
n_gen = int(row[-4])
|
||||
n_depth = int(row[-3])
|
||||
if n_prompt != 0 and n_gen == 0:
|
||||
test_name = f"pp{n_prompt}"
|
||||
elif n_prompt == 0 and n_gen != 0:
|
||||
test_name = f"tg{n_gen}"
|
||||
else:
|
||||
test_name = f"pp{n_prompt}+tg{n_gen}"
|
||||
if n_depth != 0:
|
||||
test_name = f"{test_name}@d{n_depth}"
|
||||
# Regular columns test name avg t/s values Speedup
|
||||
# VVVVVVVVVVVVV VVVVVVVVV VVVVVVVVVVVVVV VVVVVVV
|
||||
table.append(list(row[:-4]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
|
||||
table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
|
||||
|
||||
# Some a-posteriori fixes to make the table contents prettier:
|
||||
for bool_property in BOOL_PROPERTIES:
|
||||
@@ -376,7 +379,7 @@ if "gpu_info" in show:
|
||||
for gns in GPU_NAME_STRIP:
|
||||
row_table[ip] = row_table[ip].replace(gns, "")
|
||||
|
||||
gpu_names = row_table[ip].split("/")
|
||||
gpu_names = row_table[ip].split(", ")
|
||||
num_gpus = len(gpu_names)
|
||||
all_names_the_same = len(set(gpu_names)) == 1
|
||||
if len(gpu_names) >= 2 and all_names_the_same:
|
||||
|
||||
@@ -19,6 +19,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_REFACT, "refact" },
|
||||
{ LLM_ARCH_BERT, "bert" },
|
||||
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
||||
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
|
||||
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
|
||||
{ LLM_ARCH_BLOOM, "bloom" },
|
||||
{ LLM_ARCH_STABLELM, "stablelm" },
|
||||
@@ -106,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
|
||||
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
|
||||
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
|
||||
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
|
||||
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
||||
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
||||
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
||||
@@ -472,6 +474,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_NOMIC_BERT_MOE,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
||||
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_JINA_BERT_V2,
|
||||
{
|
||||
|
||||
@@ -23,6 +23,7 @@ enum llm_arch {
|
||||
LLM_ARCH_REFACT,
|
||||
LLM_ARCH_BERT,
|
||||
LLM_ARCH_NOMIC_BERT,
|
||||
LLM_ARCH_NOMIC_BERT_MOE,
|
||||
LLM_ARCH_JINA_BERT_V2,
|
||||
LLM_ARCH_BLOOM,
|
||||
LLM_ARCH_STABLELM,
|
||||
@@ -110,6 +111,7 @@ enum llm_kv {
|
||||
LLM_KV_EXPERT_WEIGHTS_SCALE,
|
||||
LLM_KV_EXPERT_WEIGHTS_NORM,
|
||||
LLM_KV_EXPERT_GATING_FUNC,
|
||||
LLM_KV_MOE_EVERY_N_LAYERS,
|
||||
LLM_KV_POOLING_TYPE,
|
||||
LLM_KV_LOGIT_SCALE,
|
||||
LLM_KV_DECODER_START_TOKEN_ID,
|
||||
|
||||
+1
-9
@@ -447,7 +447,7 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
|
||||
ss << "[gMASK]" << "<sop>";
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
@@ -456,14 +456,6 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|" << role << "|>" << "\n" << message->content;
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
|
||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||
for (auto message : chat) {
|
||||
|
||||
@@ -114,7 +114,7 @@ llama_context::llama_context(
|
||||
}
|
||||
|
||||
if (n_ctx_per_seq > hparams.n_ctx_train) {
|
||||
LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
|
||||
LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
|
||||
__func__, n_ctx_per_seq, hparams.n_ctx_train);
|
||||
}
|
||||
|
||||
@@ -1536,8 +1536,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
// set all ids as invalid (negative)
|
||||
std::fill(output_ids.begin(), output_ids.end(), -1);
|
||||
|
||||
ggml_backend_buffer_clear(buf_output.get(), 0);
|
||||
|
||||
this->n_outputs = 0;
|
||||
this->n_outputs_max = n_outputs_max;
|
||||
|
||||
|
||||
+38
-16
@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
|
||||
if (ubatch->pos && pos) {
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
|
||||
ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
|
||||
if (ubatch->token && n_pos_per_embd == 4) {
|
||||
// in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
|
||||
// the 3 first dims are the same, and 4th dim is all 0
|
||||
std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
|
||||
// copy the first dimension
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
pos_data[ i] = ubatch->pos[i];
|
||||
pos_data[ n_tokens + i] = ubatch->pos[i];
|
||||
pos_data[2 * n_tokens + i] = ubatch->pos[i];
|
||||
pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
|
||||
}
|
||||
ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
|
||||
} else {
|
||||
ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
|
||||
) * f_attn_temp_scale + 1.0;
|
||||
}
|
||||
|
||||
ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
|
||||
ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -592,7 +606,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
||||
res (std::make_unique<llm_graph_result>()) {
|
||||
}
|
||||
|
||||
int64_t llm_graph_context::n_pos_per_token() const {
|
||||
int64_t llm_graph_context::n_pos_per_embd() const {
|
||||
return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
|
||||
}
|
||||
|
||||
@@ -914,28 +928,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(up, "ffn_moe_up", il);
|
||||
|
||||
ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(gate, "ffn_moe_gate", il);
|
||||
ggml_tensor * experts = nullptr;
|
||||
if (gate_exps) {
|
||||
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(cur, "ffn_moe_gate", il);
|
||||
} else {
|
||||
cur = up;
|
||||
}
|
||||
|
||||
switch (type_op) {
|
||||
case LLM_FFN_SILU:
|
||||
{
|
||||
gate = ggml_silu(ctx0, gate);
|
||||
cb(gate, "ffn_moe_silu", il);
|
||||
cur = ggml_silu(ctx0, cur);
|
||||
cb(cur, "ffn_moe_silu", il);
|
||||
} break;
|
||||
case LLM_FFN_GELU:
|
||||
{
|
||||
gate = ggml_gelu(ctx0, gate);
|
||||
cb(gate, "ffn_moe_gelu", il);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cb(cur, "ffn_moe_gelu", il);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(par, "ffn_moe_gate_par", il);
|
||||
if (gate_exps) {
|
||||
cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(cur, "ffn_moe_gate_par", il);
|
||||
}
|
||||
|
||||
ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
||||
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
||||
cb(experts, "ffn_moe_down", il);
|
||||
|
||||
if (!weight_before_ffn) {
|
||||
@@ -1018,11 +1039,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_pos() const {
|
||||
auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
|
||||
auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
|
||||
|
||||
auto & cur = inp->pos;
|
||||
|
||||
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
|
||||
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
|
||||
ggml_set_input(cur);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
@@ -1031,11 +1052,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
|
||||
auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
|
||||
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
|
||||
|
||||
auto & cur = inp->attn_scale;
|
||||
|
||||
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
|
||||
// this need to be 1x1xN for broadcasting
|
||||
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
|
||||
ggml_set_input(cur);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
+5
-7
@@ -90,29 +90,27 @@ public:
|
||||
|
||||
class llm_graph_input_pos : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
|
||||
llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
|
||||
virtual ~llm_graph_input_pos() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
ggml_tensor * pos = nullptr; // I32 [n_batch]
|
||||
|
||||
const int64_t n_pos_per_token = 1;
|
||||
const int64_t n_pos_per_embd = 1;
|
||||
};
|
||||
|
||||
// temperature tuning, used by llama4
|
||||
class llm_graph_input_attn_temp : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
|
||||
: n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
|
||||
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
|
||||
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
|
||||
virtual ~llm_graph_input_attn_temp() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
|
||||
|
||||
const int64_t n_pos_per_token = 1;
|
||||
|
||||
const uint32_t n_attn_temp_floor_scale;
|
||||
const float f_attn_temp_scale;
|
||||
};
|
||||
@@ -419,7 +417,7 @@ struct llm_graph_context {
|
||||
|
||||
llm_graph_context(const llm_graph_params & params);
|
||||
|
||||
int64_t n_pos_per_token() const;
|
||||
int64_t n_pos_per_embd() const;
|
||||
|
||||
void cb(ggml_tensor * cur, const char * name, int il) const;
|
||||
|
||||
|
||||
@@ -66,6 +66,7 @@ struct llama_hparams {
|
||||
float expert_weights_scale = 0.0;
|
||||
bool expert_weights_norm = false;
|
||||
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
|
||||
uint32_t moe_every_n_layers = 0;
|
||||
|
||||
float f_norm_eps;
|
||||
float f_norm_rms_eps;
|
||||
|
||||
+59
-12
@@ -43,11 +43,13 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_770M: return "770M";
|
||||
case LLM_TYPE_780M: return "780M";
|
||||
case LLM_TYPE_0_5B: return "0.5B";
|
||||
case LLM_TYPE_0_6B: return "0.6B";
|
||||
case LLM_TYPE_1B: return "1B";
|
||||
case LLM_TYPE_1_3B: return "1.3B";
|
||||
case LLM_TYPE_1_4B: return "1.4B";
|
||||
case LLM_TYPE_1_5B: return "1.5B";
|
||||
case LLM_TYPE_1_6B: return "1.6B";
|
||||
case LLM_TYPE_1_7B: return "1.7B";
|
||||
case LLM_TYPE_1_8B: return "1.8B";
|
||||
case LLM_TYPE_2B: return "2B";
|
||||
case LLM_TYPE_2_8B: return "2.8B";
|
||||
@@ -66,6 +68,7 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_15B: return "15B";
|
||||
case LLM_TYPE_16B: return "16B";
|
||||
case LLM_TYPE_20B: return "20B";
|
||||
case LLM_TYPE_27B: return "27B";
|
||||
case LLM_TYPE_30B: return "30B";
|
||||
case LLM_TYPE_32B: return "32B";
|
||||
case LLM_TYPE_34B: return "34B";
|
||||
@@ -74,6 +77,7 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_65B: return "65B";
|
||||
case LLM_TYPE_70B: return "70B";
|
||||
case LLM_TYPE_236B: return "236B";
|
||||
case LLM_TYPE_290B: return "290B";
|
||||
case LLM_TYPE_314B: return "314B";
|
||||
case LLM_TYPE_671B: return "671B";
|
||||
case LLM_TYPE_SMALL: return "0.1B";
|
||||
@@ -88,10 +92,10 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_16x3_8B: return "16x3.8B";
|
||||
case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
|
||||
case LLM_TYPE_57B_A14B: return "57B.A14B";
|
||||
case LLM_TYPE_27B: return "27B";
|
||||
case LLM_TYPE_290B: return "290B";
|
||||
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
|
||||
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
|
||||
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
||||
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
||||
default: return "?B";
|
||||
}
|
||||
}
|
||||
@@ -695,10 +699,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_NOMIC_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
|
||||
|
||||
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
||||
type = LLM_TYPE_137M;
|
||||
@@ -791,6 +797,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
|
||||
case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
|
||||
case 40: type = LLM_TYPE_14B; break;
|
||||
case 64: type = LLM_TYPE_32B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@@ -800,6 +810,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 48: type = LLM_TYPE_30B_A3B; break;
|
||||
case 94: type = LLM_TYPE_235B_A22B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@@ -2057,6 +2069,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
} break;
|
||||
case LLM_ARCH_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
|
||||
@@ -2090,20 +2103,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
||||
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
||||
}
|
||||
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
|
||||
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
|
||||
if (arch == LLM_ARCH_BERT) {
|
||||
if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
||||
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
||||
} else {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
|
||||
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
||||
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
||||
} else {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
}
|
||||
}
|
||||
|
||||
layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
|
||||
@@ -5730,6 +5754,11 @@ struct llm_build_bert : public llm_graph_context {
|
||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||
@@ -5782,13 +5811,29 @@ struct llm_build_bert : public llm_graph_context {
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
if (model.arch == LLM_ARCH_BERT) {
|
||||
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
|
||||
// MoE branch
|
||||
cur = build_moe_ffn(cur,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
nullptr,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
hparams.n_expert,
|
||||
hparams.n_expert_used,
|
||||
LLM_FFN_GELU,
|
||||
false, false,
|
||||
0.0f,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
NULL, NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
@@ -5796,6 +5841,7 @@ struct llm_build_bert : public llm_graph_context {
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
@@ -5803,8 +5849,8 @@ struct llm_build_bert : public llm_graph_context {
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
// attentions bypass the intermediate layer
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
@@ -10149,7 +10195,6 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
||||
|
||||
// {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
|
||||
ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
|
||||
ggml_mul_mat_set_prec(q_nope_absorbed, GGML_PREC_F32);
|
||||
cb(q_nope_absorbed, "q_nope_absorbed", il);
|
||||
|
||||
// {kv_lora_rank, n_head, n_tokens}
|
||||
@@ -12843,6 +12888,7 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
case LLM_ARCH_BERT:
|
||||
case LLM_ARCH_JINA_BERT_V2:
|
||||
case LLM_ARCH_NOMIC_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
{
|
||||
llm = std::make_unique<llm_build_bert>(*this, params, gf);
|
||||
} break;
|
||||
@@ -13201,6 +13247,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_DBRX:
|
||||
case LLM_ARCH_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
case LLM_ARCH_STABLELM:
|
||||
case LLM_ARCH_BITNET:
|
||||
case LLM_ARCH_QWEN:
|
||||
|
||||
+6
-2
@@ -39,11 +39,13 @@ enum llm_type {
|
||||
LLM_TYPE_770M,
|
||||
LLM_TYPE_780M,
|
||||
LLM_TYPE_0_5B,
|
||||
LLM_TYPE_0_6B,
|
||||
LLM_TYPE_1B,
|
||||
LLM_TYPE_1_3B,
|
||||
LLM_TYPE_1_4B,
|
||||
LLM_TYPE_1_5B,
|
||||
LLM_TYPE_1_6B,
|
||||
LLM_TYPE_1_7B,
|
||||
LLM_TYPE_1_8B,
|
||||
LLM_TYPE_2B,
|
||||
LLM_TYPE_2_8B,
|
||||
@@ -62,6 +64,7 @@ enum llm_type {
|
||||
LLM_TYPE_15B,
|
||||
LLM_TYPE_16B,
|
||||
LLM_TYPE_20B,
|
||||
LLM_TYPE_27B,
|
||||
LLM_TYPE_30B,
|
||||
LLM_TYPE_32B,
|
||||
LLM_TYPE_34B,
|
||||
@@ -70,6 +73,7 @@ enum llm_type {
|
||||
LLM_TYPE_65B,
|
||||
LLM_TYPE_70B,
|
||||
LLM_TYPE_236B,
|
||||
LLM_TYPE_290B,
|
||||
LLM_TYPE_314B,
|
||||
LLM_TYPE_671B,
|
||||
LLM_TYPE_SMALL,
|
||||
@@ -84,10 +88,10 @@ enum llm_type {
|
||||
LLM_TYPE_16x3_8B,
|
||||
LLM_TYPE_10B_128x3_66B,
|
||||
LLM_TYPE_57B_A14B,
|
||||
LLM_TYPE_27B,
|
||||
LLM_TYPE_290B,
|
||||
LLM_TYPE_17B_16E, // llama4 Scout
|
||||
LLM_TYPE_17B_128E, // llama4 Maverick
|
||||
LLM_TYPE_30B_A3B,
|
||||
LLM_TYPE_235B_A22B,
|
||||
};
|
||||
|
||||
struct llama_layer_posnet {
|
||||
|
||||
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
|
||||
// }
|
||||
|
||||
if (k <= 0) {
|
||||
k = cur_p->size;
|
||||
return;
|
||||
}
|
||||
|
||||
k = std::min(k, (int) cur_p->size);
|
||||
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
|
||||
}
|
||||
cur_p->sorted = true;
|
||||
}
|
||||
|
||||
cur_p->size = k;
|
||||
}
|
||||
|
||||
|
||||
@@ -187,14 +187,15 @@ int main(void) {
|
||||
/* .bos_token= */ "",
|
||||
/* .eos_token= */ "",
|
||||
},
|
||||
{
|
||||
/* .name= */ "GLMEdge",
|
||||
/* .template_str= */ "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}<|assistant|>",
|
||||
/* .expected_output= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>",
|
||||
/* .expected_output_jinja= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>",
|
||||
/* .bos_token= */ "",
|
||||
/* .eos_token= */ "",
|
||||
},
|
||||
// TODO @ngxson : GLMEdge produces poor result without `[gMASK]<sop>`, so we're temporarily using GLM4 template for it. We should fix this in the future.
|
||||
// {
|
||||
// /* .name= */ "GLMEdge",
|
||||
// /* .template_str= */ "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}<|assistant|>",
|
||||
// /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>",
|
||||
// /* .expected_output_jinja= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>",
|
||||
// /* .bos_token= */ "",
|
||||
// /* .eos_token= */ "",
|
||||
// },
|
||||
{
|
||||
/* .name= */ "MiniCPM-3B-OpenHermes-2.5-v2-GGUF",
|
||||
/* .template_str= */ U8C("{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}"),
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user