Compare commits

..

9 Commits

Author SHA1 Message Date
Dou Xinpeng e6b7801bd1 cann: Add host buffer type for Ascend NPU (#9406)
* feat: Add host buffer type for Ascend NPU(CANN backend)

* fix some checking errors

* Add a few comments
2024-09-12 19:46:43 +08:00
fengerhu1 e665744317 llava : fix the script error in MobileVLM README (#9054)
Signed-off-by: Erhu Feng <2748250768@qq.com>
2024-09-12 14:34:22 +03:00
Xuan Son Nguyen d4c3c10fad lora : raise error if lm_head is ignored (#9103)
* lora : raise error if lm_head is ignored

* fix style

* clarify comment
2024-09-12 14:33:57 +03:00
Michael Podvitskiy 2a825116b6 cmake : fix for builds without GGML_CDEF_PUBLIC (#9338)
* `GGML_TARGET_DEFINES-NOTFOUND` fix for builds without `GGML_CDEF_PUBLIC`

* Update CMakeLists.txt, spaces fix
2024-09-12 14:30:01 +03:00
Huang Qi 4dc4f5f14a ci : update HIP SDK to 24.Q3 (ROCm 6.1) (#9329) 2024-09-12 14:28:43 +03:00
daminho c837981bba py : add Phi-1.5/Phi-2 tokenizer (#9361)
* add phi2 tokenizer

* add phi name to convert_hf_to_gguf_update.py

* make tokenizer_pre consistent; llama.cpp work
2024-09-12 14:28:20 +03:00
Trivikram Kamat 3c26a1644d ci : bump actions/checkout to v4 (#9377) 2024-09-12 14:27:45 +03:00
Michael Podvitskiy ff76e18516 cmake : fixed the order of linking libraries for llama-quantize (#9450) 2024-09-12 14:27:14 +03:00
Molly Sophia 39f852f440 py : add special tokens in hf_converter for RWKV v6 (#9428)
Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
2024-09-12 14:25:16 +03:00
10 changed files with 155 additions and 16 deletions
+8 -8
View File
@@ -375,7 +375,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Dependencies
id: depends
@@ -401,7 +401,7 @@ jobs:
continue-on-error: true
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: add oneAPI to apt
shell: bash
@@ -442,7 +442,7 @@ jobs:
continue-on-error: true
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: add oneAPI to apt
shell: bash
@@ -546,7 +546,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v4
- name: Dependencies
id: depends
@@ -576,7 +576,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v4
- name: Dependencies
id: depends
@@ -610,7 +610,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v4
- name: Dependencies
id: depends
@@ -969,14 +969,14 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Install
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"
+7 -1
View File
@@ -139,10 +139,16 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
# determining _precisely_ which defines are necessary for the llama-config
# package.
#
set(GGML_TRANSIENT_DEFINES)
get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
if (GGML_DIR_DEFINES)
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
endif()
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
if (GGML_TARGET_DEFINES)
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
endif()
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
+5
View File
@@ -626,6 +626,9 @@ class Model:
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
res = "exaone"
if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
# ref: https://huggingface.co/microsoft/phi-2
res = "phi-2"
if res is None:
logger.warning("\n")
@@ -2771,6 +2774,8 @@ class Rwkv6Model(Model):
self.gguf_writer.add_tokenizer_model("rwkv")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
+1
View File
@@ -98,6 +98,7 @@ models = [
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
]
+7 -1
View File
@@ -363,7 +363,13 @@ if __name__ == '__main__':
yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
dest = super().modify_tensors(data_torch, name, bid)
dest = list(super().modify_tensors(data_torch, name, bid))
# some archs may have the same tensor for lm_head and output (tie word embeddings)
# in this case, adapters targeting lm_head will fail when using llama-export-lora
# therefore, we ignore them for now
# see: https://github.com/ggerganov/llama.cpp/issues/9065
if name == "lm_head.weight" and len(dest) == 0:
raise ValueError("lm_head is present in adapter, but is ignored in base model")
for dest_name, dest_data in dest:
assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B()
+5 -5
View File
@@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert_image_encoder_to_gguf \
python ./examples/llava/convert_image_encoder_to_gguf.py \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
--output-dir path/to/MobileVLM-1.7B \
@@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf \
```
```sh
python ./examples/llava/convert_image_encoder_to_gguf \
python ./examples/llava/convert_image_encoder_to_gguf.py \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
--output-dir path/to/MobileVLM-1.7B_V2 \
@@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf \
4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
```
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
```sh
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
```
Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
+1 -1
View File
@@ -1,6 +1,6 @@
set(TARGET llama-quantize)
add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
+7
View File
@@ -80,6 +80,13 @@ ggml_backend_cann_buffer_type(int32_t device);
*/
GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
/**
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
*
* @return A pointer to the host buffer type interface.
*/
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
/**
* @brief Retrieves the description of a specific CANN device.
*
+110
View File
@@ -1221,6 +1221,116 @@ ggml_backend_cann_buffer_type(int32_t device) {
return &ggml_backend_cann_buffer_types[device];
}
/**
* @brief Retrieves the name associated with a CANN host buffer type.
*
* This function returns the descriptive name associated with the specified
* CANN host buffer type context.
*
* @param buft Pointer to the host buffer type context.
* @return Const pointer to the C-style string containing the name.
*/
GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
return "CANN_Host";
GGML_UNUSED(buft);
}
/**
* @brief Retrieves the name associated with a CANN host buffer.
*
* This function returns the descriptive name associated with the specified
* CANN host buffer context.
*
* @param buft Pointer to the host buffer context.
* @return Const pointer to the C-style string containing the name.
*/
GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
return "CANN_Host";
GGML_UNUSED(buffer);
}
/**
* @brief Free resources associated with a CANN host buffer.
*
* This function frees the resources associated with a CANN host buffer, including
* its context.
*
* @param buffer The CANN host buffer to free.
*/
GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context));
}
/**
* @brief Allocates a new CANN host buffer of the specified size.
*
* This function allocates a new CANN host buffer with the given size.
* @param size Size in bytes of the host buffer to allocate.
* @return Pointer to the allocated host buffer, or nullptr if allocation fails.
*/
static void * ggml_cann_host_malloc(size_t size) {
if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
return nullptr;
}
void * hostPtr = nullptr;
aclError err = aclrtMallocHost((void **) &hostPtr, size);
if (err != ACL_SUCCESS) {
GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
return nullptr;
}
return hostPtr;
}
/**
* @brief Allocates a new CANN host buffer of the specified type and size.
*
* @param buft Pointer to the host buffer type context.
* @param size Size in bytes of the host buffer to allocate.
* @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
*/
GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * hostPtr = ggml_cann_host_malloc(size);
if (hostPtr == nullptr) {
// fallback to cpu buffer
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
}
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
buffer->buft = buft;
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
return buffer;
}
/**
* @brief Interface for managing CANN host buffer types in the GGML backend.
*
* Provides function pointers for allocating, querying properties, and managing
* memory for CANN buffer types in the GGML backend.
*/
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cann_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
},
/* .context = */ nullptr,
};
return &ggml_backend_cann_buffer_type_host;
}
/**
* @brief Computes the forward operation for a given tensor using CANN
* operations.
+4
View File
@@ -2156,6 +2156,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
if (host_buffer) {
buft = ggml_backend_sycl_host_buffer_type();
}
#elif defined(GGML_USE_CANN)
if (host_buffer) {
buft = ggml_backend_cann_host_buffer_type();
}
#elif defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type();
#elif defined(GGML_USE_VULKAN)