mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-28 00:27:39 +02:00
Compare commits
22 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| b72942fac9 | |||
| 6afd1a99dc | |||
| 272e3bd95e | |||
| 45a55b91aa | |||
| 3071c0a5f2 | |||
| 4305b57c80 | |||
| 70c0ea3560 | |||
| 5b2c04f492 | |||
| 6f6496bb09 | |||
| daef3ab233 | |||
| 345a686d82 | |||
| 3a14e00366 | |||
| afd27f01fe | |||
| 366d486c16 | |||
| e44a561ab0 | |||
| f93d49ab1e | |||
| 5b33ea1ee7 | |||
| 85fca8deb6 | |||
| ebd541a570 | |||
| 15fa07a5c5 | |||
| be55695eff | |||
| 0478174d59 |
@@ -79,7 +79,6 @@ models-mnt
|
||||
!models/ggml-vocab-*.gguf*
|
||||
|
||||
# Zig
|
||||
|
||||
zig-out/
|
||||
zig-cache/
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ BUILD_TARGETS = \
|
||||
llama-imatrix \
|
||||
llama-infill \
|
||||
llama-llava-cli \
|
||||
llama-minicpmv-cli\
|
||||
llama-lookahead \
|
||||
llama-lookup \
|
||||
llama-lookup-create \
|
||||
@@ -888,15 +889,16 @@ ggml/src/ggml-metal-embed.o: \
|
||||
ggml/src/ggml-common.h
|
||||
@echo "Embedding Metal library"
|
||||
@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
|
||||
$(eval TEMP_ASSEMBLY=$(shell mktemp))
|
||||
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
|
||||
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
|
||||
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
|
||||
@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
|
||||
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
|
||||
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
|
||||
@$(AS) $(TEMP_ASSEMBLY) -o $@
|
||||
@rm -f ${TEMP_ASSEMBLY}
|
||||
$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
|
||||
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
|
||||
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
|
||||
@rmdir ${TEMP_ASSEMBLY}
|
||||
endif
|
||||
endif # GGML_METAL
|
||||
|
||||
@@ -1205,6 +1207,7 @@ clean:
|
||||
rm -rvf ggml/*.dll
|
||||
rm -rvf ggml/*.so
|
||||
rm -vrf ggml/src/*.o
|
||||
rm -rvf ggml/src/llamafile/*.o
|
||||
rm -rvf common/build-info.cpp
|
||||
rm -vrf ggml/src/ggml-metal-embed.metal
|
||||
rm -vrf ggml/src/ggml-cuda/*.o
|
||||
@@ -1451,15 +1454,20 @@ libllava.a: examples/llava/llava.cpp \
|
||||
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
|
||||
|
||||
llama-llava-cli: examples/llava/llava-cli.cpp \
|
||||
examples/llava/clip.h \
|
||||
examples/llava/clip.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
examples/llava/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
|
||||
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
|
||||
examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
examples/llava/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
swift: examples/batched.swift
|
||||
|
||||
@@ -1777,6 +1777,17 @@ std::string string_get_sortable_timestamp() {
|
||||
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
|
||||
}
|
||||
|
||||
// Replace every non-overlapping occurrence of `search` in `s` with `replace`.
//
// Matches are located left to right in the original text; inserted replacement
// text is never rescanned, so a `replace` that itself contains `search` cannot
// cause runaway expansion. An empty `search` leaves `s` untouched.
//
// The output is assembled in a separate buffer and swapped into `s` at the end:
//   - every character is copied once, instead of shifting the tail of `s` on
//     each match (which is quadratic when there are many matches);
//   - `search` and `replace` are only read before `s` is modified, so passing
//     arguments that alias `s` is well defined.
void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return; // an empty pattern matches at every offset - treat it as a no-op
    }

    std::string result;
    result.reserve(s.length());

    size_t last = 0; // first character of `s` not yet copied into `result`
    size_t pos  = 0; // start of the current match
    while ((pos = s.find(search, last)) != std::string::npos) {
        result.append(s, last, pos - last); // text between the previous match and this one
        result.append(replace);
        last = pos + search.length();
    }
    result.append(s, last, std::string::npos); // tail after the final match

    s.swap(result);
}
|
||||
|
||||
void string_process_escapes(std::string & input) {
|
||||
std::size_t input_len = input.length();
|
||||
std::size_t output_idx = 0;
|
||||
|
||||
@@ -286,6 +286,8 @@ std::vector<std::string> string_split(std::string input, char separator);
|
||||
std::string string_strip(const std::string & str);
|
||||
std::string string_get_sortable_timestamp();
|
||||
|
||||
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
|
||||
|
||||
template<class T>
|
||||
static std::vector<T> string_split(const std::string & str, char delim) {
|
||||
std::vector<T> values;
|
||||
|
||||
+49
-58
@@ -251,12 +251,7 @@ class Model:
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
|
||||
del name, new_name, bid, n_dims # unused
|
||||
|
||||
return False
|
||||
|
||||
def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
|
||||
def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
|
||||
del name, new_name, bid, n_dims # unused
|
||||
|
||||
return False
|
||||
@@ -285,55 +280,47 @@ class Model:
|
||||
for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
|
||||
data: np.ndarray # type hint
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
data_qtype: gguf.GGMLQuantizationType | None = None
|
||||
|
||||
# when both are True, f32 should win
|
||||
extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
|
||||
extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
|
||||
data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
# Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
|
||||
# Conditions should closely match those in llama_model_quantize_internal in llama.cpp
|
||||
extra_f32 = any(cond for cond in (
|
||||
extra_f32,
|
||||
n_dims == 1,
|
||||
new_name.endswith("_norm.weight"),
|
||||
))
|
||||
|
||||
# Some tensor types are always in float32
|
||||
extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
|
||||
gguf.MODEL_TENSOR.FFN_GATE_INP,
|
||||
gguf.MODEL_TENSOR.POS_EMBD,
|
||||
gguf.MODEL_TENSOR.TOKEN_TYPES,
|
||||
))
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
extra_f16 = any(cond for cond in (
|
||||
extra_f16,
|
||||
(name.endswith(".weight") and n_dims >= 2),
|
||||
))
|
||||
|
||||
if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
|
||||
if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
|
||||
data = gguf.quantize_bf16(data)
|
||||
assert data.dtype == np.uint16
|
||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
|
||||
data = gguf.quantize_q8_0(data)
|
||||
assert data.dtype == np.uint8
|
||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||
|
||||
else: # default to float16 for quantized tensors
|
||||
if data_dtype != np.float16:
|
||||
data = data.astype(np.float16)
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
|
||||
if data_qtype is None: # by default, convert to float32
|
||||
if data_dtype != np.float32:
|
||||
data = data.astype(np.float32)
|
||||
if n_dims <= 1 or new_name.endswith("_norm.weight"):
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
|
||||
# Conditions should closely match those in llama_model_quantize_internal in llama.cpp
|
||||
# Some tensor types are always in float32
|
||||
if data_qtype is False and (
|
||||
any(
|
||||
self.match_model_tensor_name(new_name, key, bid)
|
||||
for key in (
|
||||
gguf.MODEL_TENSOR.FFN_GATE_INP,
|
||||
gguf.MODEL_TENSOR.POS_EMBD,
|
||||
gguf.MODEL_TENSOR.TOKEN_TYPES,
|
||||
)
|
||||
)
|
||||
or not name.endswith(".weight")
|
||||
):
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
|
||||
# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
|
||||
if isinstance(data_qtype, bool):
|
||||
if self.ftype == gguf.LlamaFileType.ALL_F32:
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
|
||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||
else:
|
||||
raise ValueError(f"Unknown file type: {self.ftype.name}")
|
||||
|
||||
try:
|
||||
data = gguf.quants.quantize(data, data_qtype)
|
||||
except gguf.QuantError as e:
|
||||
logger.warning("%s, %s", e, "falling back to F16")
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
data = gguf.quants.quantize(data, data_qtype)
|
||||
|
||||
shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
|
||||
|
||||
# reverse shape to make it similar to the internal ggml dimension order
|
||||
@@ -1765,7 +1752,7 @@ class DbrxModel(Model):
|
||||
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
|
||||
def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
|
||||
del name, new_name, bid # unused
|
||||
|
||||
return n_dims > 1
|
||||
@@ -2786,18 +2773,22 @@ class MambaModel(Model):
|
||||
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
|
||||
del n_dims # unused
|
||||
|
||||
return bid is not None and new_name in (
|
||||
self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
|
||||
def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
    """Pin the per-block SSM tensors to F32; defer everything else to the parent class.

    A tensor is pinned when it belongs to a block (``bid`` is not None) and its
    mapped name equals the formatted name of one of the SSM tensor kinds listed
    below. The ".weight" suffix is used for the comparison only when the source
    tensor name carries it.
    """
    if bid is None:
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    suffix = ".weight" if name.endswith(".weight") else ""
    ssm_tensor_kinds = (
        gguf.MODEL_TENSOR.SSM_CONV1D,
        gguf.MODEL_TENSOR.SSM_X,
        gguf.MODEL_TENSOR.SSM_DT,
        gguf.MODEL_TENSOR.SSM_A,
        gguf.MODEL_TENSOR.SSM_D,
    )

    # Stop at the first match, in the same order as the kinds are listed.
    for kind in ssm_tensor_kinds:
        if new_name == self.format_tensor_name(kind, bid, suffix):
            return gguf.GGMLQuantizationType.F32

    return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
|
||||
@Model.register("CohereForCausalLM")
|
||||
|
||||
@@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
|
||||
### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
|
||||
./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null
|
||||
```
|
||||
|
||||
### Windows:
|
||||
|
||||
```powershell
|
||||
llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
|
||||
llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null
|
||||
```
|
||||
|
||||
The above command will output space-separated float values.
|
||||
@@ -50,11 +50,11 @@ The above command will output space-separated float values.
|
||||
### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
|
||||
./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
|
||||
```
|
||||
|
||||
### Windows:
|
||||
|
||||
```powershell
|
||||
embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
|
||||
llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>$null
|
||||
```
|
||||
|
||||
@@ -50,20 +50,6 @@ static struct gguf_context * load_gguf(std::string & fname, struct ggml_context
|
||||
return ctx_gguf;
|
||||
}
|
||||
|
||||
// Replace every non-overlapping occurrence of `search` in `s` with `replace`.
//
// The input is scanned left to right and the output is built in a separate
// buffer, so replacement text is never rescanned. An empty `search` leaves `s`
// unchanged.
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        // find("") succeeds at the current offset without advancing it, so the
        // scan below would never terminate and `result` would grow without bound
        return;
    }

    std::string result;
    size_t pos = 0; // first character of `s` not yet copied into `result`
    for (;;) {
        const size_t new_pos = s.find(search, pos);
        if (new_pos == std::string::npos) {
            result.append(s, pos, std::string::npos); // tail after the final match
            break;
        }
        result.append(s, pos, new_pos - pos); // text preceding this match
        result += replace;
        pos = new_pos + search.length();      // resume right after the match
    }
    s = std::move(result);
}
|
||||
|
||||
struct file_input {
|
||||
struct ggml_context * ctx_meta = nullptr;
|
||||
struct gguf_context * ctx_gguf = nullptr;
|
||||
|
||||
@@ -36,3 +36,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
set(TARGET llama-minicpmv-cli)
|
||||
add_executable(${TARGET} minicpmv-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -0,0 +1,99 @@
|
||||
## MiniCPM-Llama3-V 2.5
|
||||
|
||||
### Prepare models and code
|
||||
|
||||
Download the [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from Hugging Face into a folder named "MiniCPM-Llama3-V-2_5".
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
Convert the PyTorch model to GGUF files (you can also download the [GGUF files](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) that we have already converted):
|
||||
|
||||
```bash
|
||||
python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
||||
python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
|
||||
python ./convert-hf-to-gguf.py ../MiniCPM-Llama3-V-2_5/model
|
||||
|
||||
# quantize int4 version
|
||||
./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
```
|
||||
|
||||
Build for Linux or Mac
|
||||
|
||||
```bash
|
||||
make
|
||||
make llama-minicpmv-cli
|
||||
```
|
||||
|
||||
Inference on Linux or Mac
|
||||
```
|
||||
# run f16 version
|
||||
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# run quantized int4 version
|
||||
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# or run in interactive mode
|
||||
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
|
||||
```
|
||||
|
||||
### Android
|
||||
|
||||
#### Build on Android device using Termux
|
||||
We found that building on the Android device itself gives better runtime performance, so we recommend building on the device.
|
||||
|
||||
[Termux](https://github.com/termux/termux-app#installation) is a terminal app for Android devices (no root required).
|
||||
|
||||
Install tools in Termux:
|
||||
```
|
||||
apt update && apt upgrade -y
|
||||
apt install git make cmake
|
||||
```
|
||||
|
||||
It's recommended to move your model inside the `~/` directory for best performance:
|
||||
```
|
||||
cd storage/downloads
|
||||
mv model.gguf ~/
|
||||
```
|
||||
|
||||
#### Building the Project using Android NDK
|
||||
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
|
||||
|
||||
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
|
||||
|
||||
```bash
|
||||
mkdir build-android
|
||||
cd build-android
|
||||
export NDK=/your_ndk_path
|
||||
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
|
||||
make
|
||||
```
|
||||
|
||||
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
|
||||
|
||||
Finally, copy the built `llama` binaries and the model file to your device storage. Because file permissions on the Android sdcard cannot be changed, copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then run the following commands in Termux to make them executable:
|
||||
|
||||
(This assumes that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`.)
|
||||
```
|
||||
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
|
||||
$cd /data/data/com.termux/files/home/bin
|
||||
$chmod +x ./*
|
||||
```
|
||||
|
||||
Download the models and push them to `/sdcard/llama.cpp/`, then move them to `/data/data/com.termux/files/home/model/`:
|
||||
|
||||
```
|
||||
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
|
||||
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
|
||||
```
|
||||
|
||||
Now, you can start chatting:
|
||||
```
|
||||
$cd /data/data/com.termux/files/home/bin
|
||||
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
```
|
||||
+553
-80
@@ -74,26 +74,27 @@ static std::string format(const char * fmt, ...) {
|
||||
// key constants
|
||||
//
|
||||
|
||||
#define KEY_FTYPE "general.file_type"
|
||||
#define KEY_NAME "general.name"
|
||||
#define KEY_DESCRIPTION "general.description"
|
||||
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
|
||||
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
||||
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
|
||||
#define KEY_USE_GELU "clip.use_gelu"
|
||||
#define KEY_N_EMBD "clip.%s.embedding_length"
|
||||
#define KEY_N_FF "clip.%s.feed_forward_length"
|
||||
#define KEY_N_BLOCK "clip.%s.block_count"
|
||||
#define KEY_N_HEAD "clip.%s.attention.head_count"
|
||||
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
||||
#define KEY_PROJ_DIM "clip.%s.projection_dim"
|
||||
#define KEY_TOKENS "tokenizer.ggml.tokens"
|
||||
#define KEY_N_POSITIONS "clip.text.context_length"
|
||||
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
#define KEY_FTYPE "general.file_type"
|
||||
#define KEY_NAME "general.name"
|
||||
#define KEY_DESCRIPTION "general.description"
|
||||
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
|
||||
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
||||
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
|
||||
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
|
||||
#define KEY_USE_GELU "clip.use_gelu"
|
||||
#define KEY_N_EMBD "clip.%s.embedding_length"
|
||||
#define KEY_N_FF "clip.%s.feed_forward_length"
|
||||
#define KEY_N_BLOCK "clip.%s.block_count"
|
||||
#define KEY_N_HEAD "clip.%s.attention.head_count"
|
||||
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
||||
#define KEY_PROJ_DIM "clip.%s.projection_dim"
|
||||
#define KEY_TOKENS "tokenizer.ggml.tokens"
|
||||
#define KEY_N_POSITIONS "clip.text.context_length"
|
||||
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
@@ -127,12 +128,20 @@ static std::string format(const char * fmt, ...) {
|
||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||
|
||||
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
||||
#define TN_MINICPMV_QUERY "resampler.query"
|
||||
#define TN_MINICPMV_PROJ "resampler.proj.weight"
|
||||
#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
|
||||
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
|
||||
#define TN_MINICPMV_LN "resampler.ln_%s.%s"
|
||||
|
||||
|
||||
// Projector variants that clip_ctx::proj_type can hold (clip_ctx initializes it to PROJECTOR_TYPE_MLP).
// The string names for these values live in PROJECTOR_TYPE_NAMES.
enum projector_type {
|
||||
PROJECTOR_TYPE_MLP, // named "mlp" in PROJECTOR_TYPE_NAMES
|
||||
PROJECTOR_TYPE_MLP_NORM,
|
||||
PROJECTOR_TYPE_LDP, // named "ldp" in PROJECTOR_TYPE_NAMES
|
||||
PROJECTOR_TYPE_LDPV2, // named "ldpv2" in PROJECTOR_TYPE_NAMES
|
||||
PROJECTOR_TYPE_RESAMPLER, // named "resampler"; checked by the minicpmv projector branch of clip_image_build_graph
|
||||
PROJECTOR_TYPE_UNKNOWN, // NOTE(review): presumably the fallback for an unrecognized projector name - confirm at the lookup site
|
||||
};
|
||||
|
||||
@@ -140,6 +149,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_MLP, "mlp" },
|
||||
{ PROJECTOR_TYPE_LDP, "ldp" },
|
||||
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
|
||||
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
|
||||
};
|
||||
|
||||
|
||||
@@ -200,17 +210,14 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
|
||||
}
|
||||
|
||||
// Replace every non-overlapping occurrence of `search` in `s` with `replace`.
//
// Matches are found left to right in the original text and the output is built
// in a separate buffer, so replacement text is never rescanned and each
// character is copied once. An empty `search` leaves `s` unchanged.
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return; // Avoid infinite loop if 'search' is an empty string
    }

    std::string result;
    result.reserve(s.size());

    size_t copied = 0; // first character of `s` not yet copied into `result`
    size_t match  = 0; // start of the current match
    while ((match = s.find(search, copied)) != std::string::npos) {
        result.append(s, copied, match - copied); // text preceding this match
        result.append(replace);
        copied = match + search.length();
    }
    result.append(s, copied, std::string::npos); // tail after the final match

    s = std::move(result);
}
|
||||
|
||||
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||
@@ -492,12 +499,33 @@ struct clip_vision_model {
|
||||
struct ggml_tensor * mm_model_mlp_2_b;
|
||||
struct ggml_tensor * mm_model_peg_0_w;
|
||||
struct ggml_tensor * mm_model_peg_0_b;
|
||||
|
||||
// MINICPMV projection
|
||||
struct ggml_tensor * mm_model_pos_embed_k;
|
||||
struct ggml_tensor * mm_model_query;
|
||||
struct ggml_tensor * mm_model_proj;
|
||||
struct ggml_tensor * mm_model_kv_proj;
|
||||
struct ggml_tensor * mm_model_attn_q_w;
|
||||
struct ggml_tensor * mm_model_attn_q_b;
|
||||
struct ggml_tensor * mm_model_attn_k_w;
|
||||
struct ggml_tensor * mm_model_attn_k_b;
|
||||
struct ggml_tensor * mm_model_attn_v_w;
|
||||
struct ggml_tensor * mm_model_attn_v_b;
|
||||
struct ggml_tensor * mm_model_attn_o_w;
|
||||
struct ggml_tensor * mm_model_attn_o_b;
|
||||
struct ggml_tensor * mm_model_ln_q_w;
|
||||
struct ggml_tensor * mm_model_ln_q_b;
|
||||
struct ggml_tensor * mm_model_ln_kv_w;
|
||||
struct ggml_tensor * mm_model_ln_kv_b;
|
||||
struct ggml_tensor * mm_model_ln_post_w;
|
||||
struct ggml_tensor * mm_model_ln_post_b;
|
||||
};
|
||||
|
||||
struct clip_ctx {
|
||||
bool has_text_encoder = false;
|
||||
bool has_vision_encoder = false;
|
||||
bool has_llava_projector = false;
|
||||
bool has_minicpmv_projector = false;
|
||||
|
||||
struct clip_vision_model vision_model;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
@@ -522,9 +550,11 @@ struct clip_ctx {
|
||||
|
||||
ggml_backend_t backend = NULL;
|
||||
ggml_gallocr_t compute_alloc = NULL;
|
||||
|
||||
struct clip_image_size * load_image_size;
|
||||
};
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||
return nullptr;
|
||||
@@ -533,20 +563,33 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
const auto & model = ctx->vision_model;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int image_size = hparams.image_size;
|
||||
const int image_size = hparams.image_size;
|
||||
int image_size_width = image_size;
|
||||
int image_size_height = image_size;
|
||||
if (ctx->has_minicpmv_projector) {
|
||||
if (load_image_size == nullptr) {
|
||||
load_image_size = clip_image_size_init();
|
||||
}
|
||||
LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||
image_size_width = load_image_size->width;
|
||||
image_size_height = load_image_size->height;
|
||||
if (is_inf) {
|
||||
image_size_width = imgs->data->nx;
|
||||
image_size_height = imgs->data->ny;
|
||||
}
|
||||
}
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
|
||||
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
|
||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
||||
const int hidden_size = hparams.hidden_size;
|
||||
const int n_head = hparams.n_head;
|
||||
const int d_head = hidden_size / n_head;
|
||||
const int n_layer = hparams.n_layer;
|
||||
int n_layer = hparams.n_layer;
|
||||
const float eps = hparams.eps;
|
||||
|
||||
const int batch_size = imgs->size;
|
||||
|
||||
if (ctx->has_llava_projector) {
|
||||
if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
|
||||
GGML_ASSERT(batch_size == 1);
|
||||
}
|
||||
|
||||
@@ -559,7 +602,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
|
||||
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
|
||||
ggml_set_name(inp_raw, "inp_raw");
|
||||
ggml_set_input(inp_raw);
|
||||
|
||||
@@ -572,19 +615,21 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
|
||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||
}
|
||||
|
||||
// concat class_embeddings and patch_embeddings
|
||||
struct ggml_tensor * embeddings = inp;
|
||||
if (ctx->has_class_embedding) {
|
||||
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||
ggml_set_name(embeddings, "embeddings");
|
||||
ggml_set_input(embeddings);
|
||||
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
||||
embeddings = ggml_acc(ctx0, embeddings, inp,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
}
|
||||
struct ggml_tensor * pos_embed = nullptr;
|
||||
|
||||
if (ctx->has_llava_projector) {
|
||||
// concat class_embeddings and patch_embeddings
|
||||
if (ctx->has_class_embedding) {
|
||||
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||
ggml_set_name(embeddings, "embeddings");
|
||||
ggml_set_input(embeddings);
|
||||
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
||||
embeddings = ggml_acc(ctx0, embeddings, inp,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
||||
ggml_set_name(positions, "positions");
|
||||
@@ -593,6 +638,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
embeddings =
|
||||
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
|
||||
|
||||
if (ctx->has_minicpmv_projector) {
|
||||
int pos_w = image_size_width/patch_size;
|
||||
int pos_h = image_size_height/patch_size;
|
||||
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
|
||||
ggml_set_name(pos_embed, "pos_embed");
|
||||
ggml_set_input(pos_embed);
|
||||
}
|
||||
|
||||
// pre-layernorm
|
||||
if (ctx->has_pre_norm) {
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
@@ -602,6 +655,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
}
|
||||
|
||||
// loop over layers
|
||||
if (ctx->has_minicpmv_projector) {
|
||||
n_layer += 1;
|
||||
}
|
||||
for (int il = 0; il < n_layer - 1; il++) {
|
||||
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
|
||||
|
||||
@@ -691,7 +747,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
}
|
||||
|
||||
// llava projector
|
||||
{
|
||||
if (ctx->has_llava_projector) {
|
||||
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
||||
|
||||
struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
|
||||
@@ -872,6 +928,65 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
// minicpmv projector
|
||||
else if (ctx->has_minicpmv_projector)
|
||||
{
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
||||
struct ggml_tensor * q = model.mm_model_query;
|
||||
{ // layernorm
|
||||
q = ggml_norm(ctx0, q, eps);
|
||||
q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
|
||||
}
|
||||
struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
|
||||
{ // layernorm
|
||||
v = ggml_norm(ctx0, v, eps);
|
||||
v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
|
||||
}
|
||||
struct ggml_tensor * k;
|
||||
{ // position
|
||||
// q = ggml_add(ctx0, q, model.mm_model_pos_embed);
|
||||
k = ggml_add(ctx0, v, pos_embed);
|
||||
}
|
||||
|
||||
{ // attention
|
||||
const int hidden_size = 4096;
|
||||
const int d_head = 128;
|
||||
const int n_head = hidden_size/d_head;
|
||||
const int num_query = 96;
|
||||
|
||||
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
|
||||
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
|
||||
struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
|
||||
struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
|
||||
// permute
|
||||
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
|
||||
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
|
||||
Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
|
||||
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
|
||||
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
||||
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
|
||||
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
|
||||
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
||||
V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
KQ = ggml_soft_max_inplace(ctx0, KQ);
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
|
||||
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
|
||||
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
|
||||
|
||||
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
|
||||
}
|
||||
{ // layernorm
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
|
||||
}
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
|
||||
}
|
||||
else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, embeddings);
|
||||
@@ -1029,7 +1144,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
|
||||
}
|
||||
|
||||
GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
|
||||
idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ);
|
||||
if (idx != -1) {
|
||||
new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
|
||||
}
|
||||
|
||||
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
|
||||
|
||||
GGML_ASSERT(new_clip->has_vision_encoder);
|
||||
GGML_ASSERT(!new_clip->has_text_encoder);
|
||||
|
||||
@@ -1040,6 +1161,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||
LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
||||
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||
}
|
||||
@@ -1281,6 +1403,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
|
||||
vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
|
||||
}
|
||||
else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
||||
// vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
|
||||
vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
|
||||
vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
|
||||
vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
|
||||
vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
|
||||
vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
|
||||
vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
|
||||
vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
|
||||
vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
|
||||
vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
|
||||
vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
|
||||
vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
|
||||
vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
|
||||
vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
|
||||
vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
|
||||
vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
|
||||
vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
|
||||
vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
|
||||
vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
|
||||
}
|
||||
else {
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
@@ -1319,7 +1462,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
||||
clip_image_f32_batch batch;
|
||||
batch.size = 1;
|
||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
|
||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
||||
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||
@@ -1328,6 +1471,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
return new_clip;
|
||||
}
|
||||
|
||||
// Attach an image-size record to the clip context.
// Only the pointer is stored; no copy of the pointed-to struct is made here, so
// the caller must keep it alive for as long as the context refers to it.
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
    ctx_clip->load_image_size = load_image_size;
}
|
||||
|
||||
struct clip_image_size * clip_image_size_init() {
|
||||
struct clip_image_size * load_image_size = new struct clip_image_size();
|
||||
load_image_size->width = 448;
|
||||
load_image_size->height = 448;
|
||||
return load_image_size;
|
||||
}
|
||||
|
||||
struct clip_image_u8 * clip_image_u8_init() {
|
||||
return new clip_image_u8();
|
||||
}
|
||||
@@ -1598,9 +1752,184 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
|
||||
return patches;
|
||||
}
|
||||
|
||||
// Snap `length` to the nearest multiple of `patch_size`, never returning less
// than one full `patch_size`.
static int ensure_divide(int length, int patch_size) {
    const float n_units = std::round(static_cast<float>(length) / patch_size);
    const int   snapped = static_cast<int>(n_units * patch_size);
    return snapped < patch_size ? patch_size : snapped;
}
|
||||
|
||||
static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
||||
int width = original_size.first;
|
||||
int height = original_size.second;
|
||||
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
|
||||
float r = static_cast<float>(width) / height;
|
||||
height = static_cast<int>(scale_resolution / std::sqrt(r));
|
||||
width = static_cast<int>(height * r);
|
||||
}
|
||||
int best_width = ensure_divide(width, patch_size);
|
||||
int best_height = ensure_divide(height, patch_size);
|
||||
return std::make_pair(best_width, best_height);
|
||||
}
|
||||
|
||||
static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
||||
int width, height;
|
||||
std::tie(width, height) = original_size;
|
||||
int grid_x, grid_y;
|
||||
std::tie(grid_x, grid_y) = grid;
|
||||
|
||||
int refine_width = ensure_divide(width, grid_x);
|
||||
int refine_height = ensure_divide(height, grid_y);
|
||||
|
||||
int grid_width = refine_width / grid_x;
|
||||
int grid_height = refine_height / grid_y;
|
||||
|
||||
// auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
|
||||
auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
|
||||
int best_grid_width, best_grid_height;
|
||||
std::tie(best_grid_width, best_grid_height) = best_grid_size;
|
||||
|
||||
// std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
|
||||
std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
|
||||
return refine_size;
|
||||
}
|
||||
|
||||
// Limit `x` from above by `upper`, then from below by `lower`.
// The lower bound is applied last, so it wins if the bounds are inverted.
inline int clip(int x, int lower, int upper) {
    const int capped = (upper < x) ? upper : x;
    return (lower < capped) ? capped : lower;
}
|
||||
|
||||
// Choose the grid whose log(first / second) is closest to `log_ratio`.
//
// Candidate slice counts are multiple - 1, multiple and multiple + 1; a count of
// exactly 1 or one above `max_slice_nums` is ignored. For every remaining count,
// each factorisation (divisor, count / divisor) is a candidate grid. Candidates
// are visited in ascending order and only a strictly smaller error replaces the
// current best, so the earliest candidate wins a tie. If there are no candidates
// the result is {1, 1}.
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
    std::pair<int, int> winner{1, 1};
    float lowest_error = std::numeric_limits<float>::infinity();

    for (int slices = multiple - 1; slices <= multiple + 1; ++slices) {
        if (slices == 1 || slices > max_slice_nums) {
            continue;
        }
        for (int divisor = 1; divisor <= slices; ++divisor) {
            if (slices % divisor != 0) {
                continue;
            }
            const int quotient = slices / divisor;
            const float error = std::abs(log_ratio - std::log(1.0 * divisor / quotient));
            if (error < lowest_error) {
                winner = std::make_pair(divisor, quotient);
                lowest_error = error;
            }
        }
    }
    return winner;
}
|
||||
|
||||
// inspired from LLaVA-UHD:
|
||||
// -> https://arxiv.org/pdf/2403.11703
|
||||
// -> https://github.com/thunlp/LLaVA-UHD
|
||||
// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
|
||||
static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
|
||||
const std::pair<int, int> original_size={img->nx,img->ny};
|
||||
const int original_width = img->nx;
|
||||
const int original_height = img->ny;
|
||||
const float log_ratio = log(1.0*original_width/original_height);
|
||||
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
|
||||
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
||||
|
||||
std::vector<std::vector<clip_image_u8 *>> images;
|
||||
LOG_TEE("%s: multiple %d\n", __func__, multiple);
|
||||
images.push_back(std::vector<clip_image_u8 *>());
|
||||
|
||||
if (multiple <= 1) {
|
||||
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
|
||||
clip_image_u8 * source_image = clip_image_u8_init();
|
||||
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
||||
// source_image = image.resize(best_size, Image.Resampling.BICUBIC)
|
||||
images[images.size()-1].push_back(source_image);
|
||||
}
|
||||
else if (multiple > 1) {
|
||||
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
|
||||
clip_image_u8 * source_image = clip_image_u8_init();
|
||||
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
||||
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
|
||||
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
|
||||
images[images.size()-1].push_back(source_image);
|
||||
|
||||
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
||||
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
|
||||
|
||||
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
|
||||
clip_image_u8 * refine_image = clip_image_u8_init();
|
||||
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
|
||||
|
||||
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
|
||||
|
||||
// split_to_patches
|
||||
int width = refine_image->nx;
|
||||
int height = refine_image->ny;
|
||||
int grid_x = int(width / best_grid.first);
|
||||
int grid_y = int(height / best_grid.second);
|
||||
for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
|
||||
images.push_back(std::vector<clip_image_u8 *>());
|
||||
for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
|
||||
clip_image_u8 * patch = clip_image_u8_init();
|
||||
patch->nx = grid_x;
|
||||
patch->ny = grid_y;
|
||||
patch->buf.resize(3 * patch->nx * patch->ny);
|
||||
for (int y = patches_i; y < patches_i + grid_y; ++y) {
|
||||
for (int x = patches_j; x < patches_j + grid_x; ++x) {
|
||||
const int i = 3 * (y * refine_image->nx + x);
|
||||
const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
|
||||
patch->buf[j] = refine_image->buf[i];
|
||||
patch->buf[j+1] = refine_image->buf[i+1];
|
||||
patch->buf[j+2] = refine_image->buf[i+2];
|
||||
}
|
||||
}
|
||||
images[images.size()-1].push_back(patch);
|
||||
}
|
||||
}
|
||||
}
|
||||
return images;
|
||||
}
|
||||
|
||||
int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
|
||||
const int max_slice_nums=9;
|
||||
const int scale_resolution=448;
|
||||
const int original_width = ctx_clip->load_image_size->width;
|
||||
const int original_height = ctx_clip->load_image_size->height;
|
||||
const float log_ratio = log(1.0*original_width/original_height);
|
||||
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
|
||||
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
||||
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
||||
return best_grid.first;
|
||||
}
|
||||
|
||||
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
|
||||
// res_imgs memory is being allocated here, previous allocations will be freed if found
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
|
||||
if (clip_is_minicpmv(ctx)) {
|
||||
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
|
||||
res_imgs->size = 0;
|
||||
for (size_t i = 0; i < imgs.size(); ++i) {
|
||||
res_imgs->size += imgs[i].size();
|
||||
}
|
||||
res_imgs->data = new clip_image_f32[res_imgs->size];
|
||||
int idx = 0;
|
||||
for (size_t i = 0; i < imgs.size(); ++i) {
|
||||
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
||||
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
||||
clip_image_f32 * res = clip_image_f32_init();
|
||||
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
|
||||
res_imgs->data[idx++] = *res;
|
||||
clip_image_f32_free(res);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool pad_to_square = true;
|
||||
if (!ctx->has_vision_encoder) {
|
||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||
@@ -1816,11 +2145,99 @@ int clip_n_patches(const struct clip_ctx * ctx) {
|
||||
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
|
||||
n_patches /= 4;
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
||||
n_patches = 96;
|
||||
}
|
||||
|
||||
return n_patches;
|
||||
}
|
||||
|
||||
// Build sine/cosine embeddings for a 2-D grid of scalar positions.
//
// embed_dim : size of each output embedding; must be even
// pos       : H x W grid of positions; every row is expected to be as long as
//             the first one
// returns   : H x W x embed_dim values where, for d in [0, embed_dim/2),
//             out[h][w][d]               = sin(pos[h][w] * omega[d])
//             out[h][w][d + embed_dim/2] = cos(pos[h][w] * omega[d])
//             with omega[d] = 1 / 10000^(d / (embed_dim/2)).
//             An empty `pos` yields an empty result.
static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
    assert(embed_dim % 2 == 0);
    const int half_dim = embed_dim / 2;
    const int H = static_cast<int>(pos.size());
    // an empty grid has no first row to take the width from
    const int W = pos.empty() ? 0 : static_cast<int>(pos[0].size());

    std::vector<float> omega(half_dim);
    for (int i = 0; i < half_dim; ++i) {
        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / half_dim);
    }

    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
    for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
            for (int d = 0; d < half_dim; ++d) {
                const float out_value = pos[h][w] * omega[d];
                emb[h][w][d]            = sin(out_value);
                emb[h][w][d + half_dim] = cos(out_value);
            }
        }
    }

    return emb;
}
|
||||
|
||||
static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
|
||||
assert(embed_dim % 2 == 0);
|
||||
std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
|
||||
std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
|
||||
|
||||
int H = emb_h.size();
|
||||
int W = emb_h[0].size();
|
||||
std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
|
||||
|
||||
for (int h = 0; h < H; ++h) {
|
||||
for (int w = 0; w < W; ++w) {
|
||||
for (int d = 0; d < embed_dim / 2; ++d) {
|
||||
emb[h][w][d] = emb_h[h][w][d];
|
||||
emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
|
||||
}
|
||||
}
|
||||
}
|
||||
return emb;
|
||||
}
|
||||
|
||||
static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
|
||||
int grid_h_size = image_size.first;
|
||||
int grid_w_size = image_size.second;
|
||||
|
||||
std::vector<float> grid_h(grid_h_size);
|
||||
std::vector<float> grid_w(grid_w_size);
|
||||
|
||||
for (int i = 0; i < grid_h_size; ++i) {
|
||||
grid_h[i] = static_cast<float>(i);
|
||||
}
|
||||
for (int i = 0; i < grid_w_size; ++i) {
|
||||
grid_w[i] = static_cast<float>(i);
|
||||
}
|
||||
|
||||
std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
|
||||
for (int h = 0; h < grid_h_size; ++h) {
|
||||
for (int w = 0; w < grid_w_size; ++w) {
|
||||
grid[h][w] = grid_w[w];
|
||||
}
|
||||
}
|
||||
std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
|
||||
for (int h = 0; h < grid_h_size; ++h) {
|
||||
for (int w = 0; w < grid_w_size; ++w) {
|
||||
grid_2d[0][h][w] = grid_h[h];
|
||||
grid_2d[1][h][w] = grid_w[w];
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
|
||||
|
||||
int H = image_size.first;
|
||||
int W = image_size.second;
|
||||
std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
|
||||
for (int h = 0; h < H; ++h) {
|
||||
for (int w = 0; w < W; ++w) {
|
||||
pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
|
||||
}
|
||||
}
|
||||
|
||||
return pos_embed_2d;
|
||||
}
|
||||
|
||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||
@@ -1843,18 +2260,27 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
if (ctx->has_llava_projector) {
|
||||
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
|
||||
}
|
||||
if (ctx->has_minicpmv_projector) {
|
||||
GGML_ASSERT(batch_size == 1);
|
||||
}
|
||||
|
||||
// build the inference graph
|
||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
|
||||
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
||||
|
||||
// set inputs
|
||||
const auto & model = ctx->vision_model;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int image_size = hparams.image_size;
|
||||
const int image_size = hparams.image_size;
|
||||
int image_size_width = image_size;
|
||||
int image_size_height = image_size;
|
||||
if (ctx->has_minicpmv_projector) {
|
||||
image_size_width = imgs->data[0].nx;
|
||||
image_size_height = imgs->data[0].ny;
|
||||
}
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
|
||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
||||
|
||||
{
|
||||
@@ -1864,7 +2290,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
for (size_t i = 0; i < imgs->size; i++) {
|
||||
const int nx = imgs->data[i].nx;
|
||||
const int ny = imgs->data[i].ny;
|
||||
GGML_ASSERT(nx == image_size && ny == image_size);
|
||||
if (!ctx->has_minicpmv_projector) {
|
||||
GGML_ASSERT(nx == image_size && ny == image_size);
|
||||
}
|
||||
|
||||
const int n = nx * ny;
|
||||
|
||||
@@ -1881,37 +2309,75 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
|
||||
free(data);
|
||||
}
|
||||
|
||||
{
|
||||
if (ctx->has_class_embedding) {
|
||||
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
|
||||
|
||||
void* zero_mem = malloc(ggml_nbytes(embeddings));
|
||||
memset(zero_mem, 0, ggml_nbytes(embeddings));
|
||||
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
|
||||
free(zero_mem);
|
||||
if (ctx->has_minicpmv_projector) {
|
||||
{
|
||||
// inspired from siglip:
|
||||
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
|
||||
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
|
||||
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
||||
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
positions_data[i] = std::floor(70.0*i/num_positions);
|
||||
}
|
||||
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||
free(positions_data);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
||||
{
|
||||
// inspired from resampler of Qwen-VL:
|
||||
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
|
||||
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
|
||||
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
|
||||
if(ctx->load_image_size==nullptr){
|
||||
ctx->load_image_size= clip_image_size_init();
|
||||
}
|
||||
int pos_w = ctx->load_image_size->width/patch_size;
|
||||
int pos_h = ctx->load_image_size->height/patch_size;
|
||||
int embed_dim = 4096;
|
||||
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
||||
|
||||
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
positions_data[i] = i;
|
||||
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
|
||||
for(int i=0;i<pos_w * pos_h;++i){
|
||||
for(int j=0;j<embed_dim;++j){
|
||||
pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
|
||||
free(pos_embed_data);
|
||||
}
|
||||
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||
free(positions_data);
|
||||
}
|
||||
} else {
|
||||
{
|
||||
if (ctx->has_class_embedding) {
|
||||
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
|
||||
|
||||
{
|
||||
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
|
||||
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||
for (int i = 0; i < num_patches; i++) {
|
||||
patches_data[i] = i + 1;
|
||||
void* zero_mem = malloc(ggml_nbytes(embeddings));
|
||||
memset(zero_mem, 0, ggml_nbytes(embeddings));
|
||||
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
|
||||
free(zero_mem);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
||||
|
||||
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
positions_data[i] = i;
|
||||
}
|
||||
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||
free(positions_data);
|
||||
}
|
||||
|
||||
{
|
||||
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
|
||||
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||
for (int i = 0; i < num_patches; i++) {
|
||||
patches_data[i] = i + 1;
|
||||
}
|
||||
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
||||
free(patches_data);
|
||||
}
|
||||
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
||||
free(patches_data);
|
||||
}
|
||||
|
||||
if (ggml_backend_is_cpu(ctx->backend)) {
|
||||
@@ -2081,7 +2547,14 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
return ctx->vision_model.mm_3_b->ne[0];
|
||||
}
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
||||
return 4096;
|
||||
}
|
||||
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
}
|
||||
|
||||
bool clip_is_minicpmv(const struct clip_ctx * ctx) {
|
||||
return ctx->has_minicpmv_projector;
|
||||
}
|
||||
|
||||
+11
-2
@@ -18,14 +18,17 @@
|
||||
# define CLIP_API
|
||||
#endif
|
||||
|
||||
struct clip_ctx;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct clip_ctx;
|
||||
|
||||
// Plain width/height pair describing an image size.
// Kept as a bare C-compatible struct because it is declared inside the
// extern "C" section of this header.
struct clip_image_size {
    int width;   // horizontal extent
    int height;  // vertical extent
};
|
||||
|
||||
struct clip_image_u8_batch {
|
||||
struct clip_image_u8 * data;
|
||||
size_t size;
|
||||
@@ -55,6 +58,10 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
|
||||
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
|
||||
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
|
||||
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
|
||||
|
||||
CLIP_API struct clip_image_size * clip_image_size_init();
|
||||
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
|
||||
CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
||||
|
||||
@@ -78,6 +85,8 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
|
||||
|
||||
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
||||
|
||||
CLIP_API bool clip_is_minicpmv(const struct clip_ctx * ctx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -202,6 +202,33 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
return true;
|
||||
}
|
||||
|
||||
// Re-arrange an image into a single horizontal strip of square patches.
//
// The input is walked patch by patch (top to bottom, left to right within each
// band) and the k-th patch is written at horizontal offset k * patch_size of the
// output. The output is (patch_size * num_patches) wide and patch_size high,
// with 3 values per pixel like the input. It is allocated with
// clip_image_f32_init(); this function does not free it.
// The image dimensions are expected to be multiples of patch_size.
static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
    const int width  = image->nx;
    const int height = image->ny;
    const int num_patches = (height / patch_size) * (width / patch_size);

    clip_image_f32 * patch = clip_image_f32_init();
    patch->nx = patch_size * num_patches;
    patch->ny = patch_size;
    patch->buf.resize(3 * patch->nx * patch->ny);

    // one row of one patch is contiguous in both buffers: patch_size pixels, 3 values each
    const int row_len = 3 * patch_size;

    int patch_index = 0;
    for (int top = 0; top < height; top += patch_size) {
        for (int left = 0; left < width; left += patch_size, ++patch_index) {
            for (int pi = 0; pi < patch_size; ++pi) {
                const int src = ((top + pi) * width + left) * 3;
                const int dst = (pi * patch_size * num_patches + patch_index * patch_size) * 3;
                for (int k = 0; k < row_len; ++k) {
                    patch->buf[dst + k] = image->buf[src + k];
                }
            }
        }
    }
    return patch;
}
|
||||
|
||||
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
|
||||
// std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
|
||||
@@ -218,7 +245,44 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
|
||||
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
|
||||
|
||||
if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
||||
if (clip_is_minicpmv(ctx_clip)) {
|
||||
std::vector<float *> image_embd_v;
|
||||
image_embd_v.resize(img_res_v.size);
|
||||
struct clip_image_size * load_image_size = clip_image_size_init();
|
||||
for (size_t i = 0; i < img_res_v.size; i++) {
|
||||
const int64_t t_img_enc_step_start_us = ggml_time_us();
|
||||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
|
||||
int patch_size=14;
|
||||
load_image_size->width = img_res_v.data[i].nx;
|
||||
load_image_size->height = img_res_v.data[i].ny;
|
||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||
const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
|
||||
if (!encoded) {
|
||||
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
||||
return false;
|
||||
}
|
||||
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
|
||||
LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
|
||||
}
|
||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
||||
LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
|
||||
int n_img_pos_out = 0;
|
||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||
std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
|
||||
n_img_pos_out += clip_n_patches(ctx_clip);
|
||||
}
|
||||
*n_img_pos = n_img_pos_out;
|
||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||
free(image_embd_v[i]);
|
||||
}
|
||||
image_embd_v.clear();
|
||||
load_image_size->width = img->nx;
|
||||
load_image_size->height = img->ny;
|
||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||
LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||
}
|
||||
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
||||
// flat / default llava-1.5 type embedding
|
||||
*n_img_pos = clip_n_patches(ctx_clip);
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
|
||||
@@ -228,7 +292,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
// spatial_unpad llava-1.6 type embedding
|
||||
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
|
||||
std::vector<float *> image_embd_v;
|
||||
@@ -297,7 +362,11 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
|
||||
}
|
||||
|
||||
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
|
||||
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
|
||||
int num_max_patches = 6;
|
||||
if (clip_is_minicpmv(ctx_clip)) {
|
||||
num_max_patches = 10;
|
||||
}
|
||||
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
|
||||
if (!image_embd) {
|
||||
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
||||
return false;
|
||||
|
||||
@@ -17,12 +17,11 @@
|
||||
# define LLAVA_API
|
||||
#endif
|
||||
|
||||
struct clip_ctx;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct clip_ctx;
|
||||
struct llava_image_embed {
|
||||
float * embed;
|
||||
int n_image_pos;
|
||||
@@ -37,8 +36,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip,
|
||||
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
|
||||
/** build an image embed from a path to an image filename */
|
||||
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
|
||||
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
|
||||
/** free an embedding made with llava_image_embed_make_* */
|
||||
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
|
||||
|
||||
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
|
||||
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
|
||||
|
||||
@@ -0,0 +1,309 @@
|
||||
#include "ggml.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
|
||||
// Groups the three handles used by this tool for one image session.
// Every member starts out as a null pointer.
struct llava_context {
    struct clip_ctx      * ctx_clip  = nullptr; // CLIP context (set by the caller after clip_model_load)
    struct llama_context * ctx_llama = nullptr; // llama inference context
    struct llama_model   * model     = nullptr; // llama model the context was created from
};
|
||||
|
||||
// Prints an example invocation (using argv[0] as the program name) followed
// by a hint about the sampling temperature.
static void show_additional_info(int /*argc*/, char ** argv) {
    const char * program = argv[0];
    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", program);
    LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
|
||||
|
||||
// Log callback that forwards every message verbatim to LOG_TEE.
// The log level and the user data pointer are intentionally ignored.
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
    static_cast<void>(level);
    static_cast<void>(user_data);
    LOG_TEE("%s", text);
}
|
||||
|
||||
// Initializes the llama backend and NUMA settings, then loads the model
// named by params->model. Returns NULL (after logging) when loading fails.
static struct llama_model * llava_init(gpt_params * params) {
    llama_backend_init();
    llama_numa_init(params->numa);

    const llama_model_params mparams = llama_model_params_from_gpt_params(*params);

    llama_model * loaded = llama_load_model_from_file(params->model.c_str(), mparams);
    if (loaded != NULL) {
        return loaded;
    }

    LOG_TEE("%s: error: unable to load model\n" , __func__);
    return NULL;
}
|
||||
|
||||
// Creates a llama context for `model` and wraps it in a freshly allocated
// llava_context.
//
// The context size is forced to at least 2048 (a warning is logged when the
// requested size is smaller). The returned struct is allocated with malloc();
// its ctx_clip member is set to NULL and must be filled in by the caller.
// Returns NULL if the llama context or the wrapper cannot be created.
static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
    if (params->n_ctx < 2048) {
        // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
        LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
        ctx_params.n_ctx = 2048;
    } else {
        ctx_params.n_ctx = params->n_ctx;
    }

    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
        return NULL;
    }

    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
    if (ctx_llava == NULL) {
        LOG_TEE("%s: error: failed to allocate the llava_context\n" , __func__);
        llama_free(ctx_llama);
        return NULL;
    }

    // malloc() does not run the struct's default member initializers, so every
    // member has to be assigned explicitly.
    ctx_llava->ctx_clip  = NULL;
    ctx_llava->ctx_llama = ctx_llama;
    ctx_llava->model     = model;
    return ctx_llava;
}
|
||||
|
||||
static void llava_free(struct llava_context * ctx_llava) {
|
||||
if (ctx_llava->ctx_clip) {
|
||||
clip_free(ctx_llava->ctx_clip);
|
||||
ctx_llava->ctx_clip = NULL;
|
||||
}
|
||||
|
||||
llama_free(ctx_llava->ctx_llama);
|
||||
llama_free_model(ctx_llava->model);
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
// Loads the CLIP model from the path in params->mmproj (verbosity 1) and
// returns whatever clip_model_load returns.
static struct clip_ctx * clip_init_context(gpt_params * params) {
    const char * clip_path = params->mmproj.c_str();

    // NOTE: the original made an unused local copy of params->prompt here;
    // it had no effect and has been dropped.
    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
    return ctx_clip;
}
|
||||
|
||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
|
||||
int N = (int) tokens.size();
|
||||
for (int i = 0; i < N; i += n_batch) {
|
||||
int n_eval = (int) tokens.size() - i;
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
|
||||
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
|
||||
*n_past += n_eval;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
|
||||
std::vector<llama_token> tokens;
|
||||
tokens.push_back(id);
|
||||
return eval_tokens(ctx_llama, tokens, 1, n_past);
|
||||
}
|
||||
|
||||
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
|
||||
std::string str2 = str;
|
||||
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
|
||||
return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
|
||||
}
|
||||
|
||||
static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
|
||||
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
|
||||
auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
|
||||
slice_embed->embed = image_embed;
|
||||
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
|
||||
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
|
||||
llava_image_embed_free(slice_embed);
|
||||
}
|
||||
|
||||
static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
|
||||
std::string system_prompt;
|
||||
int idx = 0;
|
||||
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
|
||||
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
|
||||
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
||||
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
if (num_image_embeds > 1) {
|
||||
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
|
||||
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
|
||||
for (size_t j = 0; j < num_image_embeds_col; ++j) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
if (j == num_image_embeds_col - 1) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
||||
}
|
||||
|
||||
static const char * sample(struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_llama,
|
||||
int * n_past) {
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
|
||||
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
|
||||
static std::string ret;
|
||||
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
|
||||
ret = "</s>";
|
||||
} else {
|
||||
ret = llama_token_to_piece(ctx_llama, id);
|
||||
}
|
||||
eval_id(ctx_llama, id, n_past);
|
||||
return ret.c_str();
|
||||
}
|
||||
|
||||
// Builds a ready-to-chat llava_context for the image at `fname`:
// loads the CLIP model, encodes the image, loads the language model, creates
// the llama context and evaluates the image embeddings into it (advancing
// n_past).
//
// Returns NULL on any failure. Unlike the original, every failure path
// releases what was acquired so far (CLIP context, image embeddings, model,
// backend), and the CLIP context and llava_context are checked for NULL
// before they are used.
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
    auto ctx_clip = clip_init_context(params);
    if (!ctx_clip) {
        LOG_TEE("%s: error: failed to load the clip model\n", __func__);
        return NULL;
    }

    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
    if (!embeds) {
        std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
        clip_free(ctx_clip);
        return NULL;
    }

    // process the prompt
    if (params->prompt.empty() && params->interactive == false) {
        LOG_TEE("prompt should be given or interactive mode should be on");
        llava_image_embed_free(embeds);
        clip_free(ctx_clip);
        return NULL;
    }

    auto model = llava_init(params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
        llama_backend_free(); // llava_init initialized the backend before failing
        llava_image_embed_free(embeds);
        clip_free(ctx_clip);
        return NULL;
    }

    const int64_t t_llava_init_start_us = ggml_time_us();
    auto ctx_llava = llava_init_context(params, model);
    if (ctx_llava == NULL) {
        // llava_init_context already logged the reason
        llama_free_model(model);
        llama_backend_free();
        llava_image_embed_free(embeds);
        clip_free(ctx_clip);
        return NULL;
    }
    ctx_llava->ctx_clip = ctx_clip;
    const int64_t t_llava_init_end_us = ggml_time_us();
    float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
    LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);

    const int64_t t_process_image_start_us = ggml_time_us();
    process_image(ctx_llava, embeds, params, n_past);
    const int64_t t_process_image_end_us = ggml_time_us();
    float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
    LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);

    llava_image_embed_free(embeds);
    return ctx_llava;
}
|
||||
|
||||
static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
|
||||
std::string user_prompt = prompt;
|
||||
if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
|
||||
|
||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
|
||||
// generate the response
|
||||
|
||||
LOG_TEE("\n");
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
|
||||
return ctx_sampling;
|
||||
}
|
||||
|
||||
static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
|
||||
|
||||
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
|
||||
return tmp;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("llava", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
llama_log_set(llama_log_callback_logTee, nullptr);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
if (params.mmproj.empty() || (params.image.empty())) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (auto & image : params.image) {
|
||||
int n_past = 0;
|
||||
auto ctx_llava = minicpmv_init(¶ms, image, n_past);
|
||||
|
||||
if (!params.prompt.empty()) {
|
||||
LOG_TEE("<user>%s\n", params.prompt.c_str());
|
||||
LOG_TEE("<assistant>");
|
||||
auto ctx_sampling = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true);
|
||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
std::string response = "";
|
||||
bool have_tmp = false;
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0){
|
||||
if(!have_tmp)continue;
|
||||
else break;
|
||||
}
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
have_tmp = true;
|
||||
printf("%s", tmp);
|
||||
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
llama_sampling_free(ctx_sampling);
|
||||
}else {
|
||||
while (true) {
|
||||
LOG_TEE("<user>");
|
||||
std::string prompt;
|
||||
std::getline(std::cin, prompt);
|
||||
LOG_TEE("<assistant>");
|
||||
auto ctx_sampling = llama_init(ctx_llava, ¶ms, prompt, n_past, true);
|
||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
std::string response = "";
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
printf("%s", tmp);// mistral llava-1.6
|
||||
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
||||
fflush(stdout);
|
||||
}
|
||||
llama_sampling_free(ctx_sampling);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
llama_print_timings(ctx_llava->ctx_llama);
|
||||
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,382 @@
|
||||
import argparse
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
from gguf import *
|
||||
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
|
||||
|
||||
TEXT = "clip.text"
|
||||
VISION = "clip.vision"
|
||||
|
||||
|
||||
def add_key_str(raw_key: str, arch: str) -> str:
    """Fill the ``{arch}`` placeholder of a key template with ``arch``."""
    formatted = raw_key.format(arch=arch)
    return formatted
|
||||
|
||||
|
||||
def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
    """Decide whether the tensor called ``name`` is left out of the output file.

    A tensor is skipped when it is one of a few fixed names, when it is the
    visual projection weight and a MiniCPM-V projector is present, or when its
    name starts with "v" / "t" while the vision / text encoder is disabled.
    """
    always_skipped = {
        "logit_scale",
        "text_model.embeddings.position_ids",
        "vision_model.embeddings.position_ids",
    }
    if name in always_skipped:
        return True

    if has_minicpmv and name == "visual_projection.weight":
        return True

    if name.startswith("v") and not has_vision:
        return True

    return name.startswith("t") and not has_text
|
||||
|
||||
|
||||
def get_tensor_name(name: str) -> str:
    """Map a checkpoint tensor name to the name written to the GGUF file.

    Names containing "projection" are returned untouched. Names containing
    "mm_projector" get their prefix shortened to "mm" with a "model." segment
    inserted. Every other name goes through a fixed, ordered chain of
    substring replacements.
    """
    if "projection" in name:
        return name

    if "mm_projector" in name:
        renamed = name.replace("model.mm_projector", "mm")
        for pattern, replacement in (
            (r'mm\.mlp\.mlp', 'mm.model.mlp'),
            (r'mm\.peg\.peg', 'mm.model.peg'),
        ):
            renamed = re.sub(pattern, replacement, renamed, count=1)
        return renamed

    # order matters: later replacements see the result of earlier ones
    substitutions = (
        ("text_model", "t"),
        ("vision_model", "v"),
        ("encoder.layers", "blk"),
        ("embeddings.", ""),
        ("_proj", ""),
        ("self_attn.", "attn_"),
        ("layer_norm", "ln"),
        ("layernorm", "ln"),
        ("mlp.fc1", "ffn_down"),
        ("mlp.fc2", "ffn_up"),
        ("embedding", "embd"),
        ("final", "post"),
        ("layrnorm", "ln"),
    )
    for old, new in substitutions:
        name = name.replace(old, new)
    return name
|
||||
|
||||
|
||||
def bytes_to_unicode():
    """
    Build a mapping from every byte value (0-255) to a single unicode character.

    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character
    with the same code point. All remaining bytes are assigned, in increasing
    byte order, the code points 256, 257, ... so that no byte maps to a
    whitespace or control character the bpe code cannot handle.
    """
    printable = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    printable_set = set(printable)

    byte_values = list(printable)
    code_points = list(printable)
    shifted = 0
    for byte in range(2**8):
        if byte in printable_set:
            continue
        byte_values.append(byte)
        code_points.append(2**8 + shifted)
        shifted += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}
|
||||
|
||||
|
||||
# Command-line interface of the converter.
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type))")
ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)

args = ap.parse_args()


# The two "only" modes are mutually exclusive. The message now names the real
# flag (--vision-only); it previously referred to a non-existent --image-only.
if args.text_only and args.vision_only:
    print("--text-only and --vision-only arguments cannot be specified at the same time.")
    raise SystemExit(1)

if args.use_f32:
    print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
|
||||
|
||||
# output in the same directory as the model if output_dir is None
|
||||
dir_model = args.model_dir

# The vocabulary is only read for a regular two-tower CLIP checkpoint that
# actually ships a vocab.json; otherwise both names are left as None.
if not args.clip_model_is_vision and os.path.exists(dir_model + "/vocab.json") and not args.clip_model_is_openclip:
    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
        vocab = json.load(f)
        tokens = list(vocab)
else:
    vocab = None
    tokens = None

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 0 if args.use_f32 else 1
|
||||
|
||||
# if args.clip_model_is_vision or args.clip_model_is_openclip:
|
||||
# model = CLIPVisionModel.from_pretrained(dir_model)
|
||||
# processor = None
|
||||
# else:
|
||||
# model = CLIPModel.from_pretrained(dir_model)
|
||||
# processor = CLIPProcessor.from_pretrained(dir_model)
|
||||
|
||||
# Hyper-parameters used to instantiate the Idefics2 vision transformer whose
# weights are read from <model-dir>/minicpmv.clip.
default_vision_config = {
    "hidden_size": 1152,
    "image_size": 980,
    "intermediate_size": 4304,
    "model_type": "idefics2",
    "num_attention_heads": 16,
    "num_hidden_layers": 27,
    "patch_size": 14,
}
vision_config = Idefics2VisionConfig(**default_vision_config)
model = Idefics2VisionTransformer(vision_config)

processor = None
# if model.attn_pool is not None:
#     model.attn_pool = torch.nn.Identity()

# model.blocks = model.blocks[:-1]

# map_location="cpu" lets a checkpoint that was saved from another device load
# on a machine without that device.
# NOTE(review): torch.load unpickles the file - only use checkpoints you trust.
model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip"), map_location="cpu"))
|
||||
|
||||
# Pick the output file-name infix and record which parts the output contains.
# Precedence: --text-only, then --minicpmv-projector, then --vision-only.
if args.text_only:
    fname_middle = "text-"
    has_text_encoder, has_vision_encoder, has_minicpmv_projector = True, False, False
elif args.minicpmv_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder, has_vision_encoder, has_minicpmv_projector = False, True, True
elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder, has_vision_encoder, has_minicpmv_projector = False, True, False
else:
    fname_middle = ""
    has_text_encoder, has_vision_encoder, has_minicpmv_projector = True, True, False
|
||||
|
||||
# Resolve the output location and open the GGUF writer.
output_dir = dir_model if args.output_dir is None else args.output_dir
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch="clip")

# Top-level flags describing what the file contains.
fout.add_bool("clip.has_text_encoder", has_text_encoder)
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
fout.add_file_type(ftype)

if args.text_only:
    fout.add_description("text-only CLIP model")
elif args.vision_only and not has_minicpmv_projector:
    fout.add_description("vision-only CLIP model")
elif has_minicpmv_projector:
    fout.add_description("image encoder for MiniCPM-V")
    # add projector type
    fout.add_string("clip.projector_type", "resampler")
else:
    fout.add_description("two-tower CLIP model")

if has_vision_encoder:
    # vision_model hparams
    fout.add_uint32("clip.vision.image_size", 448)
    fout.add_uint32("clip.vision.patch_size", 14)
    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
    fout.add_uint32("clip.vision.projection_dim", 0)
    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
    fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    block_count = 26
    fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)

    # Normalization statistics: command-line values win; otherwise fall back to
    # the processor's values (when a processor exists) or the defaults.
    if processor is None:
        image_mean = default_image_mean if args.image_mean is None else args.image_mean
        image_std = default_image_std if args.image_std is None else args.image_std
    else:
        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)

use_gelu = True
fout.add_bool("clip.use_gelu", use_gelu)
|
||||
|
||||
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    Sine/cosine embedding of a set of scalar positions.

    embed_dim: output dimension for each position (must be even)
    pos: array of positions to encode; it is flattened to size (M,)
    returns: array of shape (M, embed_dim) - the first half of each row holds
             the sines, the second half the cosines
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2

    # frequencies 1 / 10000^(i / half_dim) for i in [0, half_dim)
    exponents = np.arange(half_dim, dtype=np.float32) / (embed_dim / 2.)
    freqs = 1. / 10000 ** exponents  # (D/2,)

    # every position times every frequency -> (M, D/2)
    angles = np.outer(pos.reshape(-1), freqs)

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
|
||||
|
||||
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Embed a two-plane coordinate grid.

    grid[0] and grid[1] are each encoded with embed_dim // 2 channels by
    get_1d_sincos_pos_embed_from_grid; the two results are joined along the
    channel axis, giving an array with embed_dim columns.
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2

    # one half of the channels per coordinate plane
    per_plane = [
        get_1d_sincos_pos_embed_from_grid(half_dim, plane)
        for plane in (grid[0], grid[1])
    ]
    return np.concatenate(per_plane, axis=1)  # (H*W, D)
|
||||
|
||||
|
||||
# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
|
||||
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """
    Build 2D sine/cosine position embeddings for a grid.

    grid_size: an int (square grid) or a (height, width) pair
    cls_token: when true, a row of zeros is prepended to the result
    return:
    pos_embed: [grid_h*grid_w, embed_dim] or [1+grid_h*grid_w, embed_dim] (w/ or w/o cls_token)
    """
    if isinstance(grid_size, int):
        n_rows = n_cols = grid_size
    else:
        n_rows, n_cols = grid_size[0], grid_size[1]

    row_coords = np.arange(n_rows, dtype=np.float32)
    col_coords = np.arange(n_cols, dtype=np.float32)
    # meshgrid is given the width coordinates first
    mesh = np.stack(np.meshgrid(col_coords, row_coords), axis=0)
    mesh = mesh.reshape([2, 1, n_rows, n_cols])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
    if not cls_token:
        return pos_embed
    return np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
|
||||
|
||||
def _replace_name_resampler(s, v):
    """Translate one resampler checkpoint entry into its output entries.

    Returns a dict of {new_name: tensor}:
      * "resampler.pos_embed*": the entry itself plus a generated
        "pos_embed_k" table (2D sin/cos, 4096 channels, 70x70 grid);
      * "resampler.proj*": the generated "pos_embed_k" table plus the tensor
        transposed over its last two axes, stored under "proj.weight";
      * "resampler.attn.in_proj_*": the tensor split into three chunks along
        dim 0, stored as the q, k and v entries;
      * anything else: passed through unchanged.
    """
    if re.match("resampler.pos_embed", s):
        renamed = {s: v}
        renamed[re.sub("pos_embed", "pos_embed_k", s)] = torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70)))
        return renamed

    if re.match("resampler.proj", s):
        renamed = {}
        renamed[re.sub("proj", "pos_embed_k", s)] = torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70)))
        renamed[re.sub("proj", "proj.weight", s)] = v.transpose(-1, -2).contiguous()
        return renamed

    if re.match("resampler.attn.in_proj_.*", s):
        q_part, k_part, v_part = v.chunk(3, dim=0)[:3]
        renamed = {}
        for target, part in (("attn.q.", q_part), ("attn.k.", k_part), ("attn.v.", v_part)):
            renamed[re.sub("attn.in_proj_", target, s)] = part
        return renamed

    return {s: v}
|
||||
|
||||
# Export the MiniCPM-V projector (resampler) tensors, if a projector file was given.
if has_minicpmv_projector:
    # map_location="cpu": the tensors are converted with .numpy() below, which
    # only works for CPU tensors, so a checkpoint saved from another device
    # must be mapped to the CPU while loading.
    # NOTE(review): torch.load unpickles the file - only use checkpoints you trust.
    projector = torch.load(args.minicpmv_projector, map_location="cpu")
    new_state_dict = {}
    for k, v in projector.items():
        kvs = _replace_name_resampler(k, v)
        for nk, nv in kvs.items():
            new_state_dict[nk] = nv
    projector = new_state_dict
    ftype_cur = 0
    for name, data in projector.items():
        name = get_tensor_name(name)
        data = data.squeeze().numpy()

        n_dims = len(data.shape)
        if ftype == 1:
            # in f16 mode only 2-D ".weight" tensors are stored as float16
            if name[-7:] == ".weight" and n_dims == 2:
                print(" Converting to float16")
                data = data.astype(np.float16)
                ftype_cur = 1
            else:
                print(" Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0
        else:
            if data.dtype != np.float32:
                print(" Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0

        fout.add_tensor(name, data)
        print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")

    print("Projector tensors added\n")
|
||||
|
||||
def _replace_name(s, v):
|
||||
s = "vision_model." + s
|
||||
if re.match("vision_model.embeddings.position_embedding", s):
|
||||
v = v.unsqueeze(0)
|
||||
return {s: v}
|
||||
|
||||
return {s: v}
|
||||
|
||||
# Rename the vision model's tensors, then convert and queue each one for writing.
state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items():
    new_state_dict.update(_replace_name(k, v))
state_dict = new_state_dict

for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        # in f16 mode only 2-D ".weight" tensors are stored as float16
        store_as_f16 = name[-7:] == ".weight" and n_dims == 2
        if store_as_f16:
            print(" Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print(" Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    elif data.dtype != np.float32:
        print(" Converting to float32")
        data = data.astype(np.float32)
        ftype_cur = 0

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)


# Flush everything to disk: header, key/value metadata, then tensor data.
fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()

print("Done. Output file: " + fname_out)
|
||||
@@ -0,0 +1,47 @@
|
||||
import argparse
|
||||
import os
|
||||
import torch
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
|
||||
# Command line: a single, mandatory path to the model directory. Every step
# below reads from or writes into that directory, so a missing value is
# rejected up front by argparse.
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", required=True, help="Path to MiniCPM-V-2.5 model")
args = ap.parse_args()

# find the model part that includes the multimodal projector weights
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
checkpoint = model.state_dict()

# get a list of mm tensor names
mm_tensors = [k for k in checkpoint if k.startswith("resampler")]

# store these tensors (converted with .float()) in a new dictionary and torch.save them
projector = {name: checkpoint[name].float() for name in mm_tensors}
torch.save(projector, f"{args.model}/minicpmv.projector")

# tensors whose name starts with "vpm" are saved separately, with "vpm." removed from the name
clip_tensors = [k for k in checkpoint if k.startswith("vpm")]
if len(clip_tensors) > 0:
    clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/minicpmv.clip")

# added tokens should be removed to be able to convert Mistral models
# NOTE: this overwrites added_tokens.json in the model directory with an empty JSON object
if os.path.exists(f"{args.model}/added_tokens.json"):
    with open(f"{args.model}/added_tokens.json", "w") as f:
        f.write("{}\n")

# Patch the language-model config in place, then save the language model and
# the tokenizer into the "model" subdirectory.
config = model.llm.config
config._name_or_path = "openbmb/MiniCPM-Llama3-V-2.5"
config.auto_map = {
    "AutoConfig": "configuration_minicpm.MiniCPMConfig",
    "AutoModel": "modeling_minicpm.MiniCPMModel",
    "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
    "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
    "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification"
}
model.llm.save_pretrained(f"{args.model}/model")
tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
tok.save_pretrained(f"{args.model}/model")

print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.")
|
||||
@@ -2,3 +2,4 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
pillow~=10.2.0
|
||||
torch~=2.2.1
|
||||
torchvision==0.17.1
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
## Overview
|
||||
|
||||
> [!IMPORTANT]
|
||||
> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and
|
||||
> insecure. **Never run the RPC server on an open network or in a sensitive environment!**
|
||||
|
||||
The `rpc-server` allows running `ggml` backend on a remote host.
|
||||
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
|
||||
This can be used for distributed LLM inference with `llama.cpp` in the following way:
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
#include <stdio.h>
|
||||
|
||||
struct rpc_server_params {
|
||||
std::string host = "0.0.0.0";
|
||||
std::string host = "127.0.0.1";
|
||||
int port = 50052;
|
||||
size_t backend_mem = 0;
|
||||
};
|
||||
@@ -114,6 +114,17 @@ int main(int argc, char * argv[]) {
|
||||
fprintf(stderr, "Invalid parameters\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.host != "127.0.0.1") {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
|
||||
fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
|
||||
fprintf(stderr, " Never expose the RPC server to an open network!\n");
|
||||
fprintf(stderr, " This is an experimental feature and is not secure!\n");
|
||||
fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
ggml_backend_t backend = create_backend();
|
||||
if (!backend) {
|
||||
fprintf(stderr, "Failed to create backend\n");
|
||||
|
||||
@@ -975,6 +975,8 @@ struct server_context {
|
||||
(prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) ||
|
||||
(prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) {
|
||||
slot.prompt = *prompt;
|
||||
} else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_array()) {
|
||||
slot.prompt = prompt->at(0);
|
||||
} else {
|
||||
send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
|
||||
+9
-15
@@ -12,9 +12,9 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU.
|
||||
|
||||
List all SYCL devices with ID, compute capability, max work group size, etc.
|
||||
|
||||
1. Build the llama.cpp for SYCL for all targets.
|
||||
1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*.
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
2. Enable the oneAPI runtime environment *(if GGML_SYCL_TARGET is set to INTEL, which is the default)*
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
@@ -29,19 +29,13 @@ source /opt/intel/oneapi/setvars.sh
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
found 2 SYCL devices:
|
||||
| | | | |Max | |Max |Global | |
|
||||
| | | | |compute|Max work|sub |mem | |
|
||||
|ID| Device Type| Name|Version|units |group |group|size | Driver version|
|
||||
|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|
|
||||
| 0| [level_zero:gpu:0]| Intel Arc A770 Graphics| 1.3| 512| 1024| 32| 16225M| 1.3.29138|
|
||||
| 1| [level_zero:gpu:1]| Intel UHD Graphics 750| 1.3| 32| 512| 32| 62631M| 1.3.29138|
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
@@ -50,6 +50,8 @@ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void
|
||||
|
||||
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
||||
|
||||
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
|
||||
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
|
||||
// helper to check if the device supports a specific family
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Woverlength-strings"
|
||||
#elif defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
+15
-10
@@ -351,15 +351,10 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
|
||||
}
|
||||
|
||||
// an async copy would normally happen after all the queued operations on both backends are completed
|
||||
// sync src, set_async dst
|
||||
if (ggml_backend_buffer_is_host(src->buffer)) {
|
||||
ggml_backend_synchronize(backend_src);
|
||||
ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
|
||||
} else {
|
||||
ggml_backend_synchronize(backend_src);
|
||||
ggml_backend_tensor_copy(src, dst);
|
||||
ggml_backend_synchronize(backend_dst);
|
||||
}
|
||||
// to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
|
||||
ggml_backend_synchronize(backend_src);
|
||||
ggml_backend_synchronize(backend_dst);
|
||||
ggml_backend_tensor_copy(src, dst);
|
||||
}
|
||||
|
||||
// events
|
||||
@@ -1782,7 +1777,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
|
||||
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
||||
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
||||
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
||||
ggml_backend_synchronize(input_backend);
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
||||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+16
-14
@@ -2358,33 +2358,35 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
|
||||
|
||||
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
||||
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
||||
|
||||
if (!ggml_backend_buffer_is_cuda(src->buffer)) {
|
||||
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
|
||||
if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// device -> device
|
||||
// device -> device copy
|
||||
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
|
||||
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
|
||||
|
||||
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
|
||||
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
|
||||
|
||||
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
|
||||
#ifndef NDEBUG
|
||||
GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
if (backend_src != backend_dst) {
|
||||
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
|
||||
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
|
||||
|
||||
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
|
||||
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
|
||||
|
||||
// copy on src stream
|
||||
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
|
||||
} else {
|
||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||
return false;
|
||||
@@ -2393,7 +2395,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
||||
#endif
|
||||
}
|
||||
|
||||
// record event on src stream
|
||||
// record event on src stream after the copy
|
||||
if (!cuda_ctx_src->copy_event) {
|
||||
ggml_cuda_set_device(cuda_ctx_src->device);
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
|
||||
@@ -2405,7 +2407,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
||||
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
|
||||
} else {
|
||||
// src and dst are on the same backend
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
+52
-17
@@ -210,7 +210,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_COUNT
|
||||
};
|
||||
|
||||
struct ggml_metal_context {
|
||||
struct ggml_backend_metal_context {
|
||||
int n_cb;
|
||||
|
||||
id<MTLDevice> device;
|
||||
@@ -224,6 +224,10 @@ struct ggml_metal_context {
|
||||
bool support_simdgroup_mm;
|
||||
|
||||
bool should_capture_next_compute;
|
||||
|
||||
// abort ggml_metal_graph_compute if callback returns true
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
// MSL code
|
||||
@@ -289,7 +293,7 @@ static void * ggml_metal_host_malloc(size_t n) {
|
||||
return data;
|
||||
}
|
||||
|
||||
static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
|
||||
|
||||
#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
|
||||
@@ -306,7 +310,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
|
||||
|
||||
// Configure context
|
||||
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
||||
struct ggml_backend_metal_context * ctx = malloc(sizeof(struct ggml_backend_metal_context));
|
||||
ctx->device = device;
|
||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
||||
ctx->queue = [ctx->device newCommandQueue];
|
||||
@@ -668,7 +672,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
return ctx;
|
||||
}
|
||||
|
||||
static void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
|
||||
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
|
||||
|
||||
for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
|
||||
@@ -734,7 +738,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
|
||||
return nil;
|
||||
}
|
||||
|
||||
static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
|
||||
static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx, const struct ggml_tensor * op) {
|
||||
for (size_t i = 0, n = 3; i < n; ++i) {
|
||||
if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
|
||||
return false;
|
||||
@@ -845,7 +849,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_metal_graph_compute(
|
||||
struct ggml_metal_context * ctx,
|
||||
struct ggml_backend_metal_context * ctx,
|
||||
struct ggml_cgraph * gf) {
|
||||
|
||||
@autoreleasepool {
|
||||
@@ -878,8 +882,11 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
command_buffer_builder[cb_idx] = command_buffer;
|
||||
|
||||
// enqueue the command buffers in order to specify their execution order
|
||||
[command_buffer enqueue];
|
||||
// always enqueue the first two command buffers
|
||||
// enqueue all of the command buffers if we don't need to abort
|
||||
if (cb_idx < 2 || ctx->abort_callback == NULL) {
|
||||
[command_buffer enqueue];
|
||||
}
|
||||
}
|
||||
|
||||
const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
|
||||
@@ -2827,7 +2834,9 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
|
||||
[encoder endEncoding];
|
||||
|
||||
[command_buffer commit];
|
||||
if (cb_idx < 2 || ctx->abort_callback == NULL) {
|
||||
[command_buffer commit];
|
||||
}
|
||||
});
|
||||
|
||||
// Wait for completion and check status of each command buffer
|
||||
@@ -2847,6 +2856,23 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
|
||||
return GGML_STATUS_FAILED;
|
||||
}
|
||||
|
||||
id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
|
||||
if (!next_buffer) {
|
||||
continue;
|
||||
}
|
||||
|
||||
bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
|
||||
if (next_queued) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
|
||||
GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
|
||||
return GGML_STATUS_ABORTED;
|
||||
}
|
||||
|
||||
[next_buffer commit];
|
||||
}
|
||||
|
||||
if (should_capture) {
|
||||
@@ -3150,7 +3176,7 @@ GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
|
||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||
ggml_metal_free(ctx);
|
||||
free(backend);
|
||||
}
|
||||
@@ -3162,13 +3188,13 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
|
||||
}
|
||||
|
||||
GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
||||
struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||
|
||||
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
||||
struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||
|
||||
return ggml_metal_supports_op(metal_ctx, op);
|
||||
}
|
||||
@@ -3213,9 +3239,9 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_metal_init(void) {
|
||||
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
||||
|
||||
struct ggml_backend_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
||||
if (ctx == NULL) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -3237,15 +3263,24 @@ bool ggml_backend_is_metal(ggml_backend_t backend) {
|
||||
void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||
|
||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||
|
||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
||||
}
|
||||
|
||||
void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) {
|
||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||
|
||||
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||
|
||||
ctx->abort_callback = abort_callback;
|
||||
ctx->abort_callback_data = user_data;
|
||||
}
|
||||
|
||||
bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||
|
||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||
|
||||
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
|
||||
}
|
||||
@@ -3253,7 +3288,7 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
||||
void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
|
||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||
|
||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||
ctx->should_capture_next_compute = true;
|
||||
}
|
||||
|
||||
|
||||
+35
-1
@@ -197,6 +197,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
|
||||
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
|
||||
return nullptr;
|
||||
}
|
||||
if (inet_addr(host) == INADDR_NONE) {
|
||||
fprintf(stderr, "Invalid host address: %s\n", host);
|
||||
return nullptr;
|
||||
}
|
||||
struct sockaddr_in serv_addr;
|
||||
serv_addr.sin_family = AF_INET;
|
||||
serv_addr.sin_addr.s_addr = inet_addr(host);
|
||||
@@ -879,6 +883,14 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
|
||||
if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// require that the tensor data does not go beyond the buffer end
|
||||
uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
|
||||
uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
|
||||
uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
|
||||
GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
|
||||
GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
|
||||
|
||||
result->op = (ggml_op) tensor->op;
|
||||
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
|
||||
result->op_params[i] = tensor->op_params[i];
|
||||
@@ -898,7 +910,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
|
||||
uint64_t offset;
|
||||
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
|
||||
size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
|
||||
const size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
|
||||
|
||||
struct ggml_init_params params {
|
||||
/*.mem_size =*/ ggml_tensor_overhead(),
|
||||
@@ -913,6 +925,17 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||
return false;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
||||
|
||||
// sanitize tensor->data
|
||||
{
|
||||
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
|
||||
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
|
||||
|
||||
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
|
||||
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
|
||||
ggml_backend_tensor_set(tensor, data, offset, size);
|
||||
ggml_free(ctx);
|
||||
@@ -943,6 +966,17 @@ bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint
|
||||
return false;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
||||
|
||||
// sanitize tensor->data
|
||||
{
|
||||
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
|
||||
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
|
||||
|
||||
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
|
||||
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
// output serialization format: | data (size bytes) |
|
||||
output.resize(size, 0);
|
||||
ggml_backend_tensor_get(tensor, output.data(), offset, size);
|
||||
|
||||
@@ -874,7 +874,7 @@ namespace dpct
|
||||
inline std::string get_preferred_gpu_platform_name() {
|
||||
std::string result;
|
||||
|
||||
std::string filter = "level-zero";
|
||||
std::string filter = "";
|
||||
char* env = getenv("ONEAPI_DEVICE_SELECTOR");
|
||||
if (env) {
|
||||
if (std::strstr(env, "level_zero")) {
|
||||
@@ -892,11 +892,24 @@ namespace dpct
|
||||
else {
|
||||
throw std::runtime_error("invalid device filter: " + std::string(env));
|
||||
}
|
||||
} else {
|
||||
auto default_device = sycl::device(sycl::default_selector_v);
|
||||
auto default_platform_name = default_device.get_platform().get_info<sycl::info::platform::name>();
|
||||
|
||||
if (std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) {
|
||||
filter = "level-zero";
|
||||
}
|
||||
else if (std::strstr(default_platform_name.c_str(), "CUDA")) {
|
||||
filter = "cuda";
|
||||
}
|
||||
else if (std::strstr(default_platform_name.c_str(), "HIP")) {
|
||||
filter = "hip";
|
||||
}
|
||||
}
|
||||
|
||||
auto plaform_list = sycl::platform::get_platforms();
|
||||
auto platform_list = sycl::platform::get_platforms();
|
||||
|
||||
for (const auto& platform : plaform_list) {
|
||||
for (const auto& platform : platform_list) {
|
||||
auto devices = platform.get_devices();
|
||||
auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
|
||||
return d.is_gpu();
|
||||
|
||||
@@ -2108,9 +2108,9 @@ void ggml_vk_instance_init() {
|
||||
}
|
||||
|
||||
static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
||||
GGML_ASSERT(idx < vk_instance.device_indices.size());
|
||||
VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")");
|
||||
ggml_vk_instance_init();
|
||||
GGML_ASSERT(idx < vk_instance.device_indices.size());
|
||||
|
||||
ctx->name = GGML_VK_NAME + std::to_string(idx);
|
||||
|
||||
|
||||
+5
-1
@@ -56,6 +56,9 @@ int ggml_sve_cnt_b = 0;
|
||||
// disable POSIX deprecation warnings
|
||||
// these functions are never going away, anyway
|
||||
#pragma warning(disable: 4996)
|
||||
|
||||
// unreachable code because of multiple instances of code after GGML_ABORT
|
||||
#pragma warning(disable: 4702)
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
@@ -3721,7 +3724,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
||||
struct ggml_tensor * view_src,
|
||||
size_t view_offs) {
|
||||
|
||||
assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
|
||||
GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
|
||||
GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
|
||||
|
||||
// find the base tensor and absolute offset
|
||||
if (view_src != NULL && view_src->view_src != NULL) {
|
||||
|
||||
@@ -1146,6 +1146,9 @@ class GGMLQuantizationType(IntEnum):
|
||||
F64 = 28
|
||||
IQ1_M = 29
|
||||
BF16 = 30
|
||||
Q4_0_4_4 = 31
|
||||
Q4_0_4_8 = 32
|
||||
Q4_0_8_8 = 33
|
||||
|
||||
|
||||
# TODO: add GGMLFileType from ggml_ftype in ggml.h
|
||||
@@ -1158,7 +1161,7 @@ class LlamaFileType(IntEnum):
|
||||
MOSTLY_F16 = 1 # except 1d tensors
|
||||
MOSTLY_Q4_0 = 2 # except 1d tensors
|
||||
MOSTLY_Q4_1 = 3 # except 1d tensors
|
||||
MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
|
||||
# MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
|
||||
# MOSTLY_Q4_2 = 5 # support has been removed
|
||||
# MOSTLY_Q4_3 = 6 # support has been removed
|
||||
MOSTLY_Q8_0 = 7 # except 1d tensors
|
||||
@@ -1187,6 +1190,9 @@ class LlamaFileType(IntEnum):
|
||||
MOSTLY_IQ4_XS = 30 # except 1d tensors
|
||||
MOSTLY_IQ1_M = 31 # except 1d tensors
|
||||
MOSTLY_BF16 = 32 # except 1d tensors
|
||||
MOSTLY_Q4_0_4_4 = 33 # except 1d tensors
|
||||
MOSTLY_Q4_0_4_8 = 34 # except 1d tensors
|
||||
MOSTLY_Q4_0_8_8 = 35 # except 1d tensors
|
||||
|
||||
GUESSED = 1024 # not specified in the model file
|
||||
|
||||
@@ -1260,6 +1266,9 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
||||
GGMLQuantizationType.F64: (1, 8),
|
||||
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
|
||||
GGMLQuantizationType.BF16: (1, 2),
|
||||
GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
|
||||
GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
|
||||
GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -191,6 +191,8 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
class LazyNumpyTensor(LazyBase):
|
||||
_tensor_type = np.ndarray
|
||||
|
||||
shape: tuple[int, ...] # Makes the type checker happy in quants.py
|
||||
|
||||
@classmethod
|
||||
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
|
||||
# The initial idea was to use np.nan as the fill value,
|
||||
|
||||
+165
-73
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
from typing import Callable, Sequence
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Callable, Sequence
|
||||
|
||||
from numpy.typing import DTypeLike
|
||||
|
||||
@@ -9,32 +10,22 @@ from .lazy import LazyNumpyTensor
|
||||
import numpy as np
|
||||
|
||||
|
||||
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
|
||||
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
|
||||
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
||||
if shape[-1] % block_size != 0:
|
||||
raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
|
||||
return (*shape[:-1], shape[-1] // block_size * type_size)
|
||||
|
||||
|
||||
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
|
||||
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
|
||||
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
||||
if shape[-1] % type_size != 0:
|
||||
raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
|
||||
return (*shape[:-1], shape[-1] // type_size * block_size)
|
||||
|
||||
|
||||
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
||||
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
|
||||
n = n.astype(np.float32, copy=False).view(np.uint32)
|
||||
# force nan to quiet
|
||||
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
|
||||
# round to nearest even
|
||||
n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
|
||||
return n.astype(np.uint16)
|
||||
|
||||
|
||||
# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
|
||||
def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
|
||||
def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
|
||||
rows = arr.reshape((-1, arr.shape[-1]))
|
||||
osize = 1
|
||||
for dim in oshape:
|
||||
@@ -46,27 +37,6 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.
|
||||
return out.reshape(oshape)
|
||||
|
||||
|
||||
def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
|
||||
return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape)
|
||||
|
||||
|
||||
__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.uint16)
|
||||
|
||||
|
||||
def quantize_bf16(n: np.ndarray):
|
||||
if type(n) is LazyNumpyTensor:
|
||||
return __quantize_bf16_lazy(n)
|
||||
else:
|
||||
return __quantize_bf16_array(n)
|
||||
|
||||
|
||||
__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]
|
||||
|
||||
|
||||
def can_quantize_to_q8_0(n: np.ndarray) -> bool:
|
||||
return n.shape[-1] % __q8_block_size == 0
|
||||
|
||||
|
||||
# round away from zero
|
||||
# ref: https://stackoverflow.com/a/59143326/22827863
|
||||
def np_roundf(n: np.ndarray) -> np.ndarray:
|
||||
@@ -76,46 +46,168 @@ def np_roundf(n: np.ndarray) -> np.ndarray:
|
||||
return np.sign(n) * b
|
||||
|
||||
|
||||
def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
|
||||
return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)
|
||||
class QuantError(Exception): ...
|
||||
|
||||
|
||||
# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
|
||||
def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
|
||||
shape = n.shape
|
||||
assert shape[-1] % __q8_block_size == 0
|
||||
|
||||
n_blocks = n.size // __q8_block_size
|
||||
|
||||
blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)
|
||||
|
||||
d = abs(blocks).max(axis=1, keepdims=True) / 127
|
||||
with np.errstate(divide="ignore"):
|
||||
id = np.where(d == 0, 0, 1 / d)
|
||||
qs = np_roundf(blocks * id)
|
||||
|
||||
# (n_blocks, 2)
|
||||
d = d.astype(np.float16).view(np.uint8)
|
||||
# (n_blocks, block_size)
|
||||
qs = qs.astype(np.int8).view(np.uint8)
|
||||
|
||||
assert d.shape[1] + qs.shape[1] == __q8_type_size
|
||||
|
||||
return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
|
||||
_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}
|
||||
|
||||
|
||||
def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
|
||||
return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))
|
||||
|
||||
|
||||
__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
|
||||
__quantize_q8_0_array,
|
||||
meta_noop=(np.uint8, __quantize_q8_0_shape_change),
|
||||
)
|
||||
|
||||
|
||||
def quantize_q8_0(data: np.ndarray):
|
||||
if type(data) is LazyNumpyTensor:
|
||||
return __quantize_q8_0_lazy(data)
|
||||
def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
|
||||
if qtype == GGMLQuantizationType.F32:
|
||||
return data.astype(np.float32, copy=False)
|
||||
elif qtype == GGMLQuantizationType.F16:
|
||||
return data.astype(np.float16, copy=False)
|
||||
elif (q := _type_traits.get(qtype)) is not None:
|
||||
return q.quantize(data)
|
||||
else:
|
||||
return __quantize_q8_0_array(data)
|
||||
raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
|
||||
|
||||
|
||||
def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
    """Convert ``data`` stored as ``qtype`` into float32.

    F32 and F16 inputs are simply cast with ``astype`` (no copy when already
    float32). Any other type is delegated to the class registered for it in
    ``_type_traits``.

    Raises:
        NotImplementedError: if no class is registered for ``qtype``.
    """
    if qtype in (GGMLQuantizationType.F32, GGMLQuantizationType.F16):
        return data.astype(np.float32, copy=False)

    traits = _type_traits.get(qtype)
    if traits is None:
        raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")
    return traits.dequantize(data)
|
||||
|
||||
|
||||
class __Quant(ABC):
    """Base class of the per-type quantization converters; used through classmethods only.

    A subclass is declared with a ``qtype=`` class keyword and must implement
    ``quantize_blocks`` and ``dequantize_blocks``. When the subclass is created,
    ``__init_subclass__`` stores its sizes from ``GGML_QUANT_SIZES``, builds the
    lazy-tensor wrappers and registers the subclass in ``_type_traits``.
    """
    # quantization type handled by the subclass (set in __init_subclass__)
    qtype: GGMLQuantizationType
    # number of float32 elements per block (rows are reshaped to (n_blocks, block_size))
    block_size: int
    # number of uint8 bytes per quantized block (quantized rows are reshaped to (n_blocks, type_size))
    type_size: int

    def __init__(self):
        # Must be raised, not returned: returning a non-None value from __init__ makes
        # Python fail with a generic "__init__() should return None" TypeError and the
        # intended message is lost.
        raise TypeError("Quant conversion classes can't have instances")

    def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None:
        """Configure and register a newly defined converter subclass for ``qtype``."""
        cls.qtype = qtype
        cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype]
        # NOTE(review): meta_noop presumably describes the dtype and shape of the lazy
        # result — confirm against LazyNumpyTensor._wrap_fn.
        cls.__quantize_lazy = LazyNumpyTensor._wrap_fn(
            cls.__quantize_array,
            meta_noop=(np.uint8, cls.__shape_to_bytes)
        )
        cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn(
            cls.__dequantize_array,
            meta_noop=(np.float32, cls.__shape_from_bytes)
        )
        # each quantization type may only be registered once
        assert qtype not in _type_traits
        _type_traits[qtype] = cls

    @classmethod
    @abstractmethod
    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Quantize float32 ``blocks`` of shape (n_blocks, block_size).

        ``quantize_rows`` asserts that the result is uint8 with a last
        dimension of ``type_size``.
        """
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Dequantize uint8 ``blocks`` of shape (n_blocks, type_size).

        ``dequantize_rows`` asserts that the result is float32 with a last
        dimension of ``block_size``.
        """
        raise NotImplementedError

    @classmethod
    def quantize_rows(cls, rows: np.ndarray) -> np.ndarray:
        """Quantize ``rows`` block by block and return uint8 data in the byte shape of ``rows``."""
        rows = rows.astype(np.float32, copy=False)
        shape = rows.shape
        n_blocks = rows.size // cls.block_size
        blocks = rows.reshape((n_blocks, cls.block_size))
        blocks = cls.quantize_blocks(blocks)
        # sanity-check what the subclass produced before reshaping it
        assert blocks.dtype == np.uint8
        assert blocks.shape[-1] == cls.type_size
        return blocks.reshape(cls.__shape_to_bytes(shape))

    @classmethod
    def dequantize_rows(cls, rows: np.ndarray) -> np.ndarray:
        """Dequantize ``rows`` (viewed as uint8) block by block and return float32 data."""
        rows = rows.view(np.uint8)
        shape = rows.shape
        n_blocks = rows.size // cls.type_size
        blocks = rows.reshape((n_blocks, cls.type_size))
        blocks = cls.dequantize_blocks(blocks)
        # sanity-check what the subclass produced before reshaping it
        assert blocks.dtype == np.float32
        assert blocks.shape[-1] == cls.block_size
        return blocks.reshape(cls.__shape_from_bytes(shape))

    @classmethod
    def __shape_to_bytes(cls, shape: Sequence[int]):
        """Byte shape of a quantized tensor whose element shape is ``shape``."""
        return quant_shape_to_byte_shape(shape, cls.qtype)

    @classmethod
    def __shape_from_bytes(cls, shape: Sequence[int]):
        """Element shape of a quantized tensor whose byte shape is ``shape``."""
        return quant_shape_from_byte_shape(shape, cls.qtype)

    @classmethod
    def __quantize_array(cls, array: np.ndarray) -> np.ndarray:
        """Quantize a concrete array by running ``quantize_rows`` over grouped rows."""
        return _apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.uint8, oshape=cls.__shape_to_bytes(array.shape))

    @classmethod
    def __dequantize_array(cls, array: np.ndarray) -> np.ndarray:
        """Dequantize a concrete array by running ``dequantize_rows`` over grouped rows."""
        return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))

    @classmethod
    def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
        # placeholder: __init_subclass__ assigns the real wrapper on every subclass
        pass

    @classmethod
    def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
        # placeholder: __init_subclass__ assigns the real wrapper on every subclass
        pass

    @classmethod
    def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
        """Return True when the last dimension of ``tensor`` is a multiple of ``block_size``."""
        return tensor.shape[-1] % cls.block_size == 0

    @classmethod
    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
        """Quantize ``tensor``, using the lazy wrapper for ``LazyNumpyTensor`` inputs.

        Raises:
            QuantError: if ``can_quantize`` rejects the tensor's shape.
        """
        if not cls.can_quantize(tensor):
            raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
        if isinstance(tensor, LazyNumpyTensor):
            return cls.__quantize_lazy(tensor)
        else:
            return cls.__quantize_array(tensor)

    @classmethod
    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
        """Dequantize ``tensor``, using the lazy wrapper for ``LazyNumpyTensor`` inputs."""
        if isinstance(tensor, LazyNumpyTensor):
            return cls.__dequantize_lazy(tensor)
        else:
            return cls.__dequantize_array(tensor)
|
||||
|
||||
|
||||
class BF16(__Quant, qtype=GGMLQuantizationType.BF16):
    """Converter between float32 and BF16, registered for ``GGMLQuantizationType.BF16``."""

    @classmethod
    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Convert blocks to bf16 bytes; same as ggml_compute_fp32_to_bf16 in ggml-impl.h."""
        bits = blocks.view(np.uint32)
        # force nan to quiet
        is_nan = (bits & 0x7fffffff) > 0x7f800000
        quiet_nan = (bits & np.uint32(0xffff0000)) | np.uint32(64 << 16)
        bits = np.where(is_nan, quiet_nan, bits)
        # round to nearest even
        rounding = 0x7fff + ((bits >> 16) & 1)
        upper = (np.uint64(bits) + rounding) >> 16
        return upper.astype(np.uint16).view(np.uint8)

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Widen each 16-bit value into the upper half of a 32-bit word and view it as float32."""
        halves = blocks.view(np.int16)
        widened = halves.astype(np.int32) << 16
        return widened.view(np.float32)
|
||||
|
||||
|
||||
class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0):
    """Converter for Q8_0 blocks: a 2-byte float16 scale followed by int8 quants."""

    @classmethod
    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Quantize blocks to Q8_0.

        Implementation of Q8_0 with bit-exact same results as reference
        implementation in ggml-quants.c
        """
        scale = abs(blocks).max(axis=1, keepdims=True) / 127
        # all-zero blocks have a zero scale; use 0 as its inverse and silence the division warning
        with np.errstate(divide="ignore"):
            inv_scale = np.where(scale == 0, 0, 1 / scale)
        quants = np_roundf(blocks * inv_scale)

        # (n_blocks, 2)
        scale_bytes = scale.astype(np.float16).view(np.uint8)
        # (n_blocks, block_size)
        quant_bytes = quants.astype(np.int8).view(np.uint8)

        return np.concatenate([scale_bytes, quant_bytes], axis=1)

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Multiply each block's int8 quants by its float16 scale, both widened to float32."""
        # the first 2 bytes of every block hold the scale, the rest the quants
        scale_bytes, quant_bytes = np.split(blocks, [2], axis=1)
        scale = scale_bytes.view(np.float16).astype(np.float32)
        quants = quant_bytes.view(np.int8).astype(np.float32)

        return (quants * scale)
|
||||
|
||||
+13
-6
@@ -62,6 +62,7 @@ while read c; do
|
||||
src/ggml*.m \
|
||||
src/ggml*.metal \
|
||||
src/ggml*.cu \
|
||||
src/ggml-cann/* \
|
||||
src/ggml-cuda/* \
|
||||
src/ggml-sycl/* \
|
||||
src/vulkan-shaders/* \
|
||||
@@ -108,6 +109,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
# src/ggml-alloc.c -> ggml/src/ggml-alloc.c
|
||||
# src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h
|
||||
# src/ggml-backend.c -> ggml/src/ggml-backend.c
|
||||
# src/ggml-cann/* -> ggml/src/ggml-cann/
|
||||
# src/ggml-cann.cpp -> ggml/src/ggml-cann.cpp
|
||||
# src/ggml-common.h -> ggml/src/ggml-common.h
|
||||
# src/ggml-cuda/* -> ggml/src/ggml-cuda/
|
||||
# src/ggml-cuda.cu -> ggml/src/ggml-cuda.cu
|
||||
@@ -126,6 +129,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
# include/ggml-alloc.h -> ggml/include/ggml-alloc.h
|
||||
# include/ggml-backend.h -> ggml/include/ggml-backend.h
|
||||
# include/ggml-blas.h -> ggml/include/ggml-blas.h
|
||||
# include/ggml-cann.h -> ggml/include/ggml-cann.h
|
||||
# include/ggml-cuda.h -> ggml/include/ggml-cuda.h
|
||||
# include/ggml-kompute.h -> ggml/include/ggml-kompute.h
|
||||
# include/ggml-metal.h -> ggml/include/ggml-metal.h
|
||||
@@ -152,6 +156,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-cann\.cpp/\1ggml\/src\/ggml-cann.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \
|
||||
@@ -169,18 +175,19 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-cann\.h/\1ggml\/include\/ggml-cann.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common\.h/examples\/common.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/examples\/common.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/examples\/common-ggml.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)LICENSE/LICENSE/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common\.h/\1examples\/common.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/\1examples\/common.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/\1examples\/common-ggml.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.cpp/\1examples\/common-ggml.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)LICENSE/\1LICENSE/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/\1scripts\/gen-authors.sh/g' \
|
||||
> ggml-src.patch.tmp
|
||||
mv ggml-src.patch.tmp ggml-src.patch
|
||||
|
||||
|
||||
@@ -1 +1 @@
|
||||
18703ad600cc68dbdb04d57434c876989a841d12
|
||||
797faa25af14126eb30134d4033139ae3c5428ed
|
||||
|
||||
@@ -10,6 +10,8 @@ cp -rpv ../ggml/src/ggml-aarch64.h ./ggml/src/ggml-aarch64.h
|
||||
cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c
|
||||
cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h
|
||||
cp -rpv ../ggml/src/ggml-backend.c ./ggml/src/ggml-backend.c
|
||||
cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/
|
||||
cp -rpv ../ggml/src/ggml-cann.cpp ./ggml/src/ggml-cann.cpp
|
||||
cp -rpv ../ggml/src/ggml-common.h ./ggml/src/ggml-common.h
|
||||
cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/
|
||||
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml/src/ggml-cuda.cu
|
||||
@@ -29,6 +31,7 @@ cp -rpv ../ggml/include/ggml.h ./ggml/include/ggml.h
|
||||
cp -rpv ../ggml/include/ggml-alloc.h ./ggml/include/ggml-alloc.h
|
||||
cp -rpv ../ggml/include/ggml-backend.h ./ggml/include/ggml-backend.h
|
||||
cp -rpv ../ggml/include/ggml-blas.h ./ggml/include/ggml-blas.h
|
||||
cp -rpv ../ggml/include/ggml-cann.h ./ggml/include/ggml-cann.h
|
||||
cp -rpv ../ggml/include/ggml-cuda.h ./ggml/include/ggml-cuda.h
|
||||
cp -rpv ../ggml/include/ggml-kompute.h ./ggml/include/ggml-kompute.h
|
||||
cp -rpv ../ggml/include/ggml-metal.h ./ggml/include/ggml-metal.h
|
||||
|
||||
@@ -24,3 +24,18 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
|
||||
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
||||
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
||||
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
|
||||
//
|
||||
// helpers
|
||||
//
|
||||
|
||||
// Replace every non-overlapping occurrence of 'search' in 's' with 'replace',
// scanning left to right. Inserted replacement text is never rescanned.
// An empty 'search' leaves 's' unchanged.
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return; // Avoid infinite loop if 'search' is an empty string
    }
    size_t match = s.find(search);
    if (match == std::string::npos) {
        return; // nothing to replace: leave 's' untouched without allocating
    }
    // Build the result in a separate buffer: an in-place std::string::replace shifts
    // the tail on every match, which is quadratic for inputs with many matches.
    std::string result;
    result.reserve(s.length());
    size_t copied = 0; // start of the part of 's' that has not been copied yet
    do {
        result.append(s, copied, match - copied);
        result.append(replace);
        copied = match + search.length();
        match  = s.find(search, copied);
    } while (match != std::string::npos);
    result.append(s, copied, std::string::npos);
    s.swap(result);
}
|
||||
|
||||
@@ -16,20 +16,6 @@
|
||||
// helpers
|
||||
//
|
||||
|
||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
std::string result;
|
||||
for (size_t pos = 0; ; pos += search.length()) {
|
||||
auto new_pos = s.find(search, pos);
|
||||
if (new_pos == std::string::npos) {
|
||||
result += s.substr(pos, s.size() - pos);
|
||||
break;
|
||||
}
|
||||
result += s.substr(pos, new_pos - pos) + replace;
|
||||
pos = new_pos;
|
||||
}
|
||||
s = std::move(result);
|
||||
}
|
||||
|
||||
LLAMA_ATTRIBUTE_FORMAT(1, 2)
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
|
||||
+42
-36
@@ -121,17 +121,6 @@ static std::string trim(const std::string & str) {
|
||||
return str.substr(start, end - start);
|
||||
}
|
||||
|
||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
if (search.empty()) {
|
||||
return; // Avoid infinite loop if 'search' is an empty string
|
||||
}
|
||||
size_t pos = 0;
|
||||
while ((pos = s.find(search, pos)) != std::string::npos) {
|
||||
s.replace(pos, search.length(), replace);
|
||||
pos += replace.length();
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_float_close(float a, float b, float abs_tol) {
|
||||
// Check for non-negative tolerance
|
||||
if (abs_tol < 0.0) {
|
||||
@@ -13178,13 +13167,13 @@ struct llm_build_context {
|
||||
|
||||
// self-attention
|
||||
{
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_enc, cur);
|
||||
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_enc, cur);
|
||||
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_enc, cur);
|
||||
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
@@ -13218,7 +13207,7 @@ struct llm_build_context {
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wo_enc, cur);
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
@@ -13292,13 +13281,13 @@ struct llm_build_context {
|
||||
|
||||
// self-attention
|
||||
{
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
|
||||
@@ -13345,7 +13334,7 @@ struct llm_build_context {
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
@@ -13362,13 +13351,13 @@ struct llm_build_context {
|
||||
|
||||
// cross-attention
|
||||
{
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur);
|
||||
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_cross, embd_enc);
|
||||
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_cross, embd_enc);
|
||||
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
@@ -13397,7 +13386,7 @@ struct llm_build_context {
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wo_cross, cur);
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
@@ -13454,7 +13443,7 @@ struct llm_build_context {
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
}
|
||||
|
||||
@@ -15304,7 +15293,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
|
||||
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
|
||||
if (n_expert > 1) {
|
||||
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
|
||||
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
|
||||
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
||||
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
||||
// tensor name.
|
||||
@@ -17343,6 +17332,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
|
||||
// TODO: replace all non-fatal assertions with returned errors or exceptions
|
||||
struct llama_data_write {
|
||||
virtual void write(const void * src, size_t size) = 0;
|
||||
virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
|
||||
virtual size_t get_size_written() = 0;
|
||||
virtual ~llama_data_write() = default;
|
||||
|
||||
@@ -17465,9 +17455,8 @@ struct llama_data_write {
|
||||
// Read each range of cells of k_size length each into tmp_buf and write out
|
||||
for (const auto & range : cell_ranges) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
tmp_buf.resize(range_size * k_size_row);
|
||||
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
|
||||
write(tmp_buf.data(), tmp_buf.size());
|
||||
const size_t buf_size = range_size * k_size_row;
|
||||
write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17486,9 +17475,8 @@ struct llama_data_write {
|
||||
// Read each range of cells of v_size length each into tmp_buf and write out
|
||||
for (const auto & range : cell_ranges) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
tmp_buf.resize(range_size * v_size_row);
|
||||
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
|
||||
write(tmp_buf.data(), tmp_buf.size());
|
||||
const size_t buf_size = range_size * v_size_row;
|
||||
write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -17514,9 +17502,8 @@ struct llama_data_write {
|
||||
for (const auto & range : cell_ranges) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
||||
tmp_buf.resize(range_size * v_size_el);
|
||||
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
||||
write(tmp_buf.data(), tmp_buf.size());
|
||||
const size_t buf_size = range_size * v_size_el;
|
||||
write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -17875,12 +17862,14 @@ struct llama_data_write_dummy : llama_data_write {
|
||||
|
||||
llama_data_write_dummy() {}
|
||||
|
||||
// TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context
|
||||
|
||||
void write(const void * /* src */, size_t size) override {
|
||||
size_written += size;
|
||||
}
|
||||
|
||||
// Size-only variant: the tensor and offset are ignored, only the byte count is accumulated.
void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
    size_written = size_written + size;
}
|
||||
|
||||
size_t get_size_written() override {
|
||||
return size_written;
|
||||
}
|
||||
@@ -17903,6 +17892,16 @@ struct llama_data_write_buffer : llama_data_write {
|
||||
buf_size -= size;
|
||||
}
|
||||
|
||||
// Copy 'size' bytes of 'tensor', starting at 'offset', directly into the output buffer at 'ptr'.
// Throws std::runtime_error when fewer than 'size' bytes remain in the buffer.
void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
    const bool fits = size <= buf_size;
    if (!fits) {
        throw std::runtime_error("unexpectedly reached end of buffer");
    }
    ggml_backend_tensor_get(tensor, ptr, offset, size);
    // account for the bytes just written
    buf_size     -= size;
    size_written += size;
    ptr          += size;
}
|
||||
|
||||
size_t get_size_written() override {
|
||||
return size_written;
|
||||
}
|
||||
@@ -17938,6 +17937,7 @@ struct llama_data_read_buffer : llama_data_read {
|
||||
struct llama_data_write_file : llama_data_write {
|
||||
llama_file * file;
|
||||
size_t size_written = 0;
|
||||
std::vector<uint8_t> temp_buffer;
|
||||
|
||||
llama_data_write_file(llama_file * f) : file(f) {}
|
||||
|
||||
@@ -17946,6 +17946,12 @@ struct llama_data_write_file : llama_data_write {
|
||||
size_written += size;
|
||||
}
|
||||
|
||||
// Read 'size' bytes of 'tensor', starting at 'offset', into the reusable 'temp_buffer'
// and pass them on to write().
void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
    temp_buffer.resize(size);
    uint8_t * const staging = temp_buffer.data();
    ggml_backend_tensor_get(tensor, staging, offset, size);
    // after the resize the buffer holds exactly 'size' bytes
    write(staging, size);
}
|
||||
|
||||
size_t get_size_written() override {
|
||||
return size_written;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user