mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-16 18:47:39 +02:00
Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 1182cf4d4f | |||
| fe54033b69 | |||
| 5eaf9964fc | |||
| d292f4f204 | |||
| 256d1bb0dd | |||
| faa3526a1e | |||
| ddc5a5033f | |||
| cd4fddb29f | |||
| c9b316c78f | |||
| bf63d695b8 | |||
| 1387ea2117 |
@@ -225,6 +225,9 @@ effectiveStdenv.mkDerivation (
|
||||
description = "contains numpy and sentencepiece";
|
||||
buildInputs = [ llama-python ];
|
||||
inputsFrom = [ finalAttrs.finalPackage ];
|
||||
shellHook = ''
|
||||
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
|
||||
'';
|
||||
};
|
||||
|
||||
shell-extra = mkShell {
|
||||
|
||||
@@ -112,6 +112,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
|
||||
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
|
||||
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
|
||||
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
|
||||
|
||||
|
||||
**Bindings:**
|
||||
|
||||
+11
-1
@@ -129,6 +129,8 @@ static void sampler_queue(
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||
|
||||
const float temp = params.temp;
|
||||
const float dynatemp_range = params.dynatemp_range;
|
||||
const float dynatemp_exponent = params.dynatemp_exponent;
|
||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float min_p = params.min_p;
|
||||
@@ -143,7 +145,15 @@ static void sampler_queue(
|
||||
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||
case 't': llama_sample_temp (ctx_main, &cur_p, temp); break;
|
||||
case 't':
|
||||
if (dynatemp_range > 0) {
|
||||
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
||||
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
||||
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
|
||||
} else {
|
||||
llama_sample_temp(ctx_main, &cur_p, temp);
|
||||
}
|
||||
break;
|
||||
default : break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,6 +18,8 @@ typedef struct llama_sampling_params {
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typical_p = 1.00f; // 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.10f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
|
||||
@@ -30,6 +30,7 @@ android {
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
arguments += "-DCMAKE_BUILD_TYPE=Release"
|
||||
cppFlags += listOf()
|
||||
arguments += listOf()
|
||||
}
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
# Function calling example using pydantic models.
|
||||
import datetime
|
||||
import importlib
|
||||
import json
|
||||
from enum import Enum
|
||||
from typing import Union, Optional
|
||||
from typing import Optional, Union
|
||||
|
||||
import requests
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
import importlib
|
||||
from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
|
||||
from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model,
|
||||
create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
|
||||
|
||||
|
||||
# Function to get completion on the llama.cpp server with grammar.
|
||||
@@ -35,7 +35,7 @@ class SendMessageToUser(BaseModel):
|
||||
print(self.message)
|
||||
|
||||
|
||||
# Enum for the calculator function.
|
||||
# Enum for the calculator tool.
|
||||
class MathOperation(Enum):
|
||||
ADD = "add"
|
||||
SUBTRACT = "subtract"
|
||||
@@ -43,7 +43,7 @@ class MathOperation(Enum):
|
||||
DIVIDE = "divide"
|
||||
|
||||
|
||||
# Very simple calculator tool for the agent.
|
||||
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
|
||||
class Calculator(BaseModel):
|
||||
"""
|
||||
Perform a math operation on two numbers.
|
||||
@@ -148,37 +148,6 @@ def get_current_datetime(output_format: Optional[str] = None):
|
||||
return datetime.datetime.now().strftime(output_format)
|
||||
|
||||
|
||||
# Enum for the calculator tool.
|
||||
class MathOperation(Enum):
|
||||
ADD = "add"
|
||||
SUBTRACT = "subtract"
|
||||
MULTIPLY = "multiply"
|
||||
DIVIDE = "divide"
|
||||
|
||||
|
||||
|
||||
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
|
||||
class Calculator(BaseModel):
|
||||
"""
|
||||
Perform a math operation on two numbers.
|
||||
"""
|
||||
number_one: Union[int, float] = Field(..., description="First number.")
|
||||
operation: MathOperation = Field(..., description="Math operation to perform.")
|
||||
number_two: Union[int, float] = Field(..., description="Second number.")
|
||||
|
||||
def run(self):
|
||||
if self.operation == MathOperation.ADD:
|
||||
return self.number_one + self.number_two
|
||||
elif self.operation == MathOperation.SUBTRACT:
|
||||
return self.number_one - self.number_two
|
||||
elif self.operation == MathOperation.MULTIPLY:
|
||||
return self.number_one * self.number_two
|
||||
elif self.operation == MathOperation.DIVIDE:
|
||||
return self.number_one / self.number_two
|
||||
else:
|
||||
raise ValueError("Unknown operation.")
|
||||
|
||||
|
||||
# Example function to get the weather
|
||||
def get_current_weather(location, unit):
|
||||
"""Get the current weather in a given location"""
|
||||
|
||||
@@ -1,15 +1,21 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import inspect
|
||||
import json
|
||||
import re
|
||||
from copy import copy
|
||||
from inspect import isclass, getdoc
|
||||
from types import NoneType
|
||||
from enum import Enum
|
||||
from inspect import getdoc, isclass
|
||||
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
|
||||
|
||||
from docstring_parser import parse
|
||||
from pydantic import BaseModel, create_model, Field
|
||||
from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias
|
||||
from enum import Enum
|
||||
from typing import get_type_hints, Callable
|
||||
import re
|
||||
from pydantic import BaseModel, Field, create_model
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from types import GenericAlias
|
||||
else:
|
||||
# python 3.8 compat
|
||||
from typing import _GenericAlias as GenericAlias
|
||||
|
||||
|
||||
class PydanticDataType(Enum):
|
||||
@@ -43,7 +49,7 @@ class PydanticDataType(Enum):
|
||||
SET = "set"
|
||||
|
||||
|
||||
def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
|
||||
def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
|
||||
if isclass(pydantic_type) and issubclass(pydantic_type, str):
|
||||
return PydanticDataType.STRING.value
|
||||
elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
|
||||
@@ -57,22 +63,22 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
|
||||
|
||||
elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
|
||||
return format_model_and_field_name(pydantic_type.__name__)
|
||||
elif get_origin(pydantic_type) == list:
|
||||
elif get_origin(pydantic_type) is list:
|
||||
element_type = get_args(pydantic_type)[0]
|
||||
return f"{map_pydantic_type_to_gbnf(element_type)}-list"
|
||||
elif get_origin(pydantic_type) == set:
|
||||
elif get_origin(pydantic_type) is set:
|
||||
element_type = get_args(pydantic_type)[0]
|
||||
return f"{map_pydantic_type_to_gbnf(element_type)}-set"
|
||||
elif get_origin(pydantic_type) == Union:
|
||||
elif get_origin(pydantic_type) is Union:
|
||||
union_types = get_args(pydantic_type)
|
||||
union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
|
||||
return f"union-{'-or-'.join(union_rules)}"
|
||||
elif get_origin(pydantic_type) == Optional:
|
||||
elif get_origin(pydantic_type) is Optional:
|
||||
element_type = get_args(pydantic_type)[0]
|
||||
return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
|
||||
elif isclass(pydantic_type):
|
||||
return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
|
||||
elif get_origin(pydantic_type) == dict:
|
||||
elif get_origin(pydantic_type) is dict:
|
||||
key_type, value_type = get_args(pydantic_type)
|
||||
return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
|
||||
else:
|
||||
@@ -106,7 +112,6 @@ def get_members_structure(cls, rule_name):
|
||||
return f"{cls.__name__.lower()} ::= " + " | ".join(members)
|
||||
if cls.__annotations__ and cls.__annotations__ != {}:
|
||||
result = f'{rule_name} ::= "{{"'
|
||||
type_list_rules = []
|
||||
# Modify this comprehension
|
||||
members = [
|
||||
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}'
|
||||
@@ -116,27 +121,25 @@ def get_members_structure(cls, rule_name):
|
||||
|
||||
result += '"," '.join(members)
|
||||
result += ' "}"'
|
||||
return result, type_list_rules
|
||||
elif rule_name == "custom-class-any":
|
||||
return result
|
||||
if rule_name == "custom-class-any":
|
||||
result = f"{rule_name} ::= "
|
||||
result += "value"
|
||||
type_list_rules = []
|
||||
return result, type_list_rules
|
||||
else:
|
||||
init_signature = inspect.signature(cls.__init__)
|
||||
parameters = init_signature.parameters
|
||||
result = f'{rule_name} ::= "{{"'
|
||||
type_list_rules = []
|
||||
# Modify this comprehension too
|
||||
members = [
|
||||
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}'
|
||||
for name, param in parameters.items()
|
||||
if name != "self" and param.annotation != inspect.Parameter.empty
|
||||
]
|
||||
return result
|
||||
|
||||
result += '", "'.join(members)
|
||||
result += ' "}"'
|
||||
return result, type_list_rules
|
||||
init_signature = inspect.signature(cls.__init__)
|
||||
parameters = init_signature.parameters
|
||||
result = f'{rule_name} ::= "{{"'
|
||||
# Modify this comprehension too
|
||||
members = [
|
||||
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}'
|
||||
for name, param in parameters.items()
|
||||
if name != "self" and param.annotation != inspect.Parameter.empty
|
||||
]
|
||||
|
||||
result += '", "'.join(members)
|
||||
result += ' "}"'
|
||||
return result
|
||||
|
||||
|
||||
def regex_to_gbnf(regex_pattern: str) -> str:
|
||||
@@ -269,7 +272,7 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
|
||||
|
||||
def generate_gbnf_rule_for_type(
|
||||
model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None
|
||||
) -> Tuple[str, list]:
|
||||
) -> tuple[str, list[str]]:
|
||||
"""
|
||||
Generate GBNF rule for a given field type.
|
||||
|
||||
@@ -283,7 +286,7 @@ def generate_gbnf_rule_for_type(
|
||||
:param field_info: Additional information about the field (optional).
|
||||
|
||||
:return: Tuple containing the GBNF type and a list of additional rules.
|
||||
:rtype: Tuple[str, list]
|
||||
:rtype: tuple[str, list]
|
||||
"""
|
||||
rules = []
|
||||
|
||||
@@ -321,8 +324,7 @@ def generate_gbnf_rule_for_type(
|
||||
gbnf_type, rules = model_name + "-" + field_name, rules
|
||||
|
||||
elif gbnf_type.startswith("custom-class-"):
|
||||
nested_model_rules, field_types = get_members_structure(field_type, gbnf_type)
|
||||
rules.append(nested_model_rules)
|
||||
rules.append(get_members_structure(field_type, gbnf_type))
|
||||
elif gbnf_type.startswith("custom-dict-"):
|
||||
key_type, value_type = get_args(field_type)
|
||||
|
||||
@@ -341,14 +343,14 @@ def generate_gbnf_rule_for_type(
|
||||
union_rules = []
|
||||
|
||||
for union_type in union_types:
|
||||
if isinstance(union_type, _GenericAlias):
|
||||
if isinstance(union_type, GenericAlias):
|
||||
union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
|
||||
model_name, field_name, union_type, False, processed_models, created_rules
|
||||
)
|
||||
union_rules.append(union_gbnf_type)
|
||||
rules.extend(union_rules_list)
|
||||
|
||||
elif not issubclass(union_type, NoneType):
|
||||
elif not issubclass(union_type, type(None)):
|
||||
union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
|
||||
model_name, field_name, union_type, False, processed_models, created_rules
|
||||
)
|
||||
@@ -424,14 +426,10 @@ def generate_gbnf_rule_for_type(
|
||||
else:
|
||||
gbnf_type, rules = gbnf_type, []
|
||||
|
||||
if gbnf_type not in created_rules:
|
||||
return gbnf_type, rules
|
||||
else:
|
||||
if gbnf_type in created_rules:
|
||||
return gbnf_type, rules
|
||||
return gbnf_type, rules
|
||||
|
||||
|
||||
def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created_rules: dict) -> (list, bool, bool):
|
||||
def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[BaseModel]], created_rules: dict[str, list[str]]) -> tuple[list[str], bool]:
|
||||
"""
|
||||
|
||||
Generate GBnF Grammar
|
||||
@@ -452,7 +450,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
|
||||
```
|
||||
"""
|
||||
if model in processed_models:
|
||||
return []
|
||||
return [], False
|
||||
|
||||
processed_models.add(model)
|
||||
model_name = format_model_and_field_name(model.__name__)
|
||||
@@ -518,7 +516,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
|
||||
|
||||
|
||||
def generate_gbnf_grammar_from_pydantic_models(
|
||||
models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None,
|
||||
models: list[type[BaseModel]], outer_object_name: str | None = None, outer_object_content: str | None = None,
|
||||
list_of_outputs: bool = False
|
||||
) -> str:
|
||||
"""
|
||||
@@ -528,7 +526,7 @@ def generate_gbnf_grammar_from_pydantic_models(
|
||||
* grammar.
|
||||
|
||||
Args:
|
||||
models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from.
|
||||
models (list[type[BaseModel]]): A list of Pydantic models to generate the grammar from.
|
||||
outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
|
||||
outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
|
||||
list_of_outputs (str, optional): Allows a list of output objects
|
||||
@@ -543,9 +541,9 @@ def generate_gbnf_grammar_from_pydantic_models(
|
||||
# root ::= UserModel | PostModel
|
||||
# ...
|
||||
"""
|
||||
processed_models = set()
|
||||
processed_models: set[type[BaseModel]] = set()
|
||||
all_rules = []
|
||||
created_rules = {}
|
||||
created_rules: dict[str, list[str]] = {}
|
||||
if outer_object_name is None:
|
||||
for model in models:
|
||||
model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules)
|
||||
@@ -608,7 +606,7 @@ def get_primitive_grammar(grammar):
|
||||
Returns:
|
||||
str: GBNF primitive grammar string.
|
||||
"""
|
||||
type_list = []
|
||||
type_list: list[type[object]] = []
|
||||
if "string-list" in grammar:
|
||||
type_list.append(str)
|
||||
if "boolean-list" in grammar:
|
||||
@@ -666,14 +664,14 @@ triple-quotes ::= "'''" """
|
||||
|
||||
|
||||
def generate_markdown_documentation(
|
||||
pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
|
||||
pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
|
||||
documentation_with_field_description=True
|
||||
) -> str:
|
||||
"""
|
||||
Generate markdown documentation for a list of Pydantic models.
|
||||
|
||||
Args:
|
||||
pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
|
||||
pydantic_models (list[type[BaseModel]]): list of Pydantic model classes.
|
||||
model_prefix (str): Prefix for the model section.
|
||||
fields_prefix (str): Prefix for the fields section.
|
||||
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
||||
@@ -731,7 +729,7 @@ def generate_markdown_documentation(
|
||||
|
||||
|
||||
def generate_field_markdown(
|
||||
field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
|
||||
field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
|
||||
documentation_with_field_description=True
|
||||
) -> str:
|
||||
"""
|
||||
@@ -739,8 +737,8 @@ def generate_field_markdown(
|
||||
|
||||
Args:
|
||||
field_name (str): Name of the field.
|
||||
field_type (Type[Any]): Type of the field.
|
||||
model (Type[BaseModel]): Pydantic model class.
|
||||
field_type (type[Any]): Type of the field.
|
||||
model (type[BaseModel]): Pydantic model class.
|
||||
depth (int): Indentation depth in the documentation.
|
||||
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
||||
|
||||
@@ -798,7 +796,7 @@ def generate_field_markdown(
|
||||
return field_text
|
||||
|
||||
|
||||
def format_json_example(example: dict, depth: int) -> str:
|
||||
def format_json_example(example: dict[str, Any], depth: int) -> str:
|
||||
"""
|
||||
Format a JSON example into a readable string with indentation.
|
||||
|
||||
@@ -819,14 +817,14 @@ def format_json_example(example: dict, depth: int) -> str:
|
||||
|
||||
|
||||
def generate_text_documentation(
|
||||
pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
|
||||
pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
|
||||
documentation_with_field_description=True
|
||||
) -> str:
|
||||
"""
|
||||
Generate text documentation for a list of Pydantic models.
|
||||
|
||||
Args:
|
||||
pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
|
||||
pydantic_models (list[type[BaseModel]]): List of Pydantic model classes.
|
||||
model_prefix (str): Prefix for the model section.
|
||||
fields_prefix (str): Prefix for the fields section.
|
||||
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
||||
@@ -885,7 +883,7 @@ def generate_text_documentation(
|
||||
|
||||
|
||||
def generate_field_text(
|
||||
field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
|
||||
field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
|
||||
documentation_with_field_description=True
|
||||
) -> str:
|
||||
"""
|
||||
@@ -893,8 +891,8 @@ def generate_field_text(
|
||||
|
||||
Args:
|
||||
field_name (str): Name of the field.
|
||||
field_type (Type[Any]): Type of the field.
|
||||
model (Type[BaseModel]): Pydantic model class.
|
||||
field_type (type[Any]): Type of the field.
|
||||
model (type[BaseModel]): Pydantic model class.
|
||||
depth (int): Indentation depth in the documentation.
|
||||
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
||||
|
||||
@@ -1017,8 +1015,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
|
||||
pydantic_model_list,
|
||||
grammar_file_path="./generated_grammar.gbnf",
|
||||
documentation_file_path="./generated_grammar_documentation.md",
|
||||
outer_object_name: str = None,
|
||||
outer_object_content: str = None,
|
||||
outer_object_name: str | None = None,
|
||||
outer_object_content: str | None = None,
|
||||
model_prefix: str = "Output Model",
|
||||
fields_prefix: str = "Output Fields",
|
||||
list_of_outputs: bool = False,
|
||||
@@ -1053,8 +1051,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
|
||||
|
||||
def generate_gbnf_grammar_and_documentation(
|
||||
pydantic_model_list,
|
||||
outer_object_name: str = None,
|
||||
outer_object_content: str = None,
|
||||
outer_object_name: str | None = None,
|
||||
outer_object_content: str | None = None,
|
||||
model_prefix: str = "Output Model",
|
||||
fields_prefix: str = "Output Fields",
|
||||
list_of_outputs: bool = False,
|
||||
@@ -1086,9 +1084,9 @@ def generate_gbnf_grammar_and_documentation(
|
||||
|
||||
|
||||
def generate_gbnf_grammar_and_documentation_from_dictionaries(
|
||||
dictionaries: List[dict],
|
||||
outer_object_name: str = None,
|
||||
outer_object_content: str = None,
|
||||
dictionaries: list[dict[str, Any]],
|
||||
outer_object_name: str | None = None,
|
||||
outer_object_content: str | None = None,
|
||||
model_prefix: str = "Output Model",
|
||||
fields_prefix: str = "Output Fields",
|
||||
list_of_outputs: bool = False,
|
||||
@@ -1098,7 +1096,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
|
||||
Generate GBNF grammar and documentation from a list of dictionaries.
|
||||
|
||||
Args:
|
||||
dictionaries (List[dict]): List of dictionaries representing Pydantic models.
|
||||
dictionaries (list[dict]): List of dictionaries representing Pydantic models.
|
||||
outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
|
||||
outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
|
||||
model_prefix (str): Prefix for the model section in the documentation.
|
||||
@@ -1120,7 +1118,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
|
||||
return grammar, documentation
|
||||
|
||||
|
||||
def create_dynamic_model_from_function(func: Callable):
|
||||
def create_dynamic_model_from_function(func: Callable[..., Any]):
|
||||
"""
|
||||
Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method.
|
||||
|
||||
@@ -1135,6 +1133,7 @@ def create_dynamic_model_from_function(func: Callable):
|
||||
sig = inspect.signature(func)
|
||||
|
||||
# Parse the docstring
|
||||
assert func.__doc__ is not None
|
||||
docstring = parse(func.__doc__)
|
||||
|
||||
dynamic_fields = {}
|
||||
@@ -1157,7 +1156,6 @@ def create_dynamic_model_from_function(func: Callable):
|
||||
f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")
|
||||
|
||||
# Add parameter details to the schema
|
||||
param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
|
||||
param_docs.append((param.name, param_doc))
|
||||
if param.default == inspect.Parameter.empty:
|
||||
default_value = ...
|
||||
@@ -1166,10 +1164,10 @@ def create_dynamic_model_from_function(func: Callable):
|
||||
dynamic_fields[param.name] = (
|
||||
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
|
||||
# Creating the dynamic model
|
||||
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
|
||||
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) # type: ignore[call-overload]
|
||||
|
||||
for param_doc in param_docs:
|
||||
dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description
|
||||
for name, param_doc in param_docs:
|
||||
dynamic_model.model_fields[name].description = param_doc.description
|
||||
|
||||
dynamic_model.__doc__ = docstring.short_description
|
||||
|
||||
@@ -1182,16 +1180,16 @@ def create_dynamic_model_from_function(func: Callable):
|
||||
return dynamic_model
|
||||
|
||||
|
||||
def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
|
||||
def add_run_method_to_dynamic_model(model: type[BaseModel], func: Callable[..., Any]):
|
||||
"""
|
||||
Add a 'run' method to a dynamic Pydantic model, using the provided function.
|
||||
|
||||
Args:
|
||||
model (Type[BaseModel]): Dynamic Pydantic model class.
|
||||
model (type[BaseModel]): Dynamic Pydantic model class.
|
||||
func (Callable): Function to be added as a 'run' method to the model.
|
||||
|
||||
Returns:
|
||||
Type[BaseModel]: Pydantic model class with the added 'run' method.
|
||||
type[BaseModel]: Pydantic model class with the added 'run' method.
|
||||
"""
|
||||
|
||||
def run_method_wrapper(self):
|
||||
@@ -1204,15 +1202,15 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
|
||||
return model
|
||||
|
||||
|
||||
def create_dynamic_models_from_dictionaries(dictionaries: List[dict]):
|
||||
def create_dynamic_models_from_dictionaries(dictionaries: list[dict[str, Any]]):
|
||||
"""
|
||||
Create a list of dynamic Pydantic model classes from a list of dictionaries.
|
||||
|
||||
Args:
|
||||
dictionaries (List[dict]): List of dictionaries representing model structures.
|
||||
dictionaries (list[dict]): List of dictionaries representing model structures.
|
||||
|
||||
Returns:
|
||||
List[Type[BaseModel]]: List of generated dynamic Pydantic model classes.
|
||||
list[type[BaseModel]]: List of generated dynamic Pydantic model classes.
|
||||
"""
|
||||
dynamic_models = []
|
||||
for func in dictionaries:
|
||||
@@ -1249,7 +1247,7 @@ def list_to_enum(enum_name, values):
|
||||
return Enum(enum_name, {value: value for value in values})
|
||||
|
||||
|
||||
def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]:
|
||||
def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: str = "CustomModel") -> type[Any]:
|
||||
"""
|
||||
Convert a dictionary to a Pydantic model class.
|
||||
|
||||
@@ -1258,9 +1256,9 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
|
||||
model_name (str): Name of the generated Pydantic model.
|
||||
|
||||
Returns:
|
||||
Type[BaseModel]: Generated Pydantic model class.
|
||||
type[BaseModel]: Generated Pydantic model class.
|
||||
"""
|
||||
fields = {}
|
||||
fields: dict[str, Any] = {}
|
||||
|
||||
if "properties" in dictionary:
|
||||
for field_name, field_data in dictionary.get("properties", {}).items():
|
||||
@@ -1277,7 +1275,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
|
||||
if items != {}:
|
||||
array = {"properties": items}
|
||||
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
|
||||
fields[field_name] = (List[array_type], ...)
|
||||
fields[field_name] = (List[array_type], ...) # type: ignore[valid-type]
|
||||
else:
|
||||
fields[field_name] = (list, ...)
|
||||
elif field_type == "object":
|
||||
|
||||
+2
-2
@@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
||||
if (block->size >= size) {
|
||||
best_fit_block = alloc->n_free_blocks - 1;
|
||||
} else {
|
||||
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
|
||||
__func__, size, max_avail);
|
||||
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
|
||||
__func__, tensor->name, size, max_avail);
|
||||
GGML_ASSERT(!"not enough space in the buffer");
|
||||
return;
|
||||
}
|
||||
|
||||
+3
-3
@@ -4283,7 +4283,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
||||
q8 += 8;
|
||||
aux32 >>= 7;
|
||||
}
|
||||
const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f;
|
||||
const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
|
||||
return d * sumi;
|
||||
#else
|
||||
// iqs is 0...15
|
||||
@@ -4294,7 +4294,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
|
||||
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
||||
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f;
|
||||
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
|
||||
const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
||||
const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
|
||||
const int8_t * q8 = bq8_1[ib32].qs + 16*il;
|
||||
@@ -4339,7 +4339,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f;
|
||||
const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
|
||||
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
||||
#else
|
||||
assert(false);
|
||||
|
||||
+8
-5
@@ -277,6 +277,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
||||
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
|
||||
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||
|
||||
@@ -315,13 +319,12 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
//[options setFastMathEnabled:false];
|
||||
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// print MTL GPU family:
|
||||
|
||||
@@ -1669,6 +1669,9 @@ struct llama_context {
|
||||
for (ggml_backend_t backend : backends) {
|
||||
ggml_backend_free(backend);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_free(buf_input);
|
||||
ggml_free(ctx_input);
|
||||
}
|
||||
|
||||
llama_cparams cparams;
|
||||
@@ -1715,8 +1718,14 @@ struct llama_context {
|
||||
// allocator for the input tensors
|
||||
ggml_tallocr * alloc = nullptr;
|
||||
|
||||
// temporary buffer for copying data to/from the backend
|
||||
std::vector<no_init<uint8_t>> buf_copy;
|
||||
// input tensors
|
||||
ggml_backend_buffer_t buf_input = nullptr;
|
||||
ggml_context * ctx_input = nullptr;
|
||||
struct ggml_tensor * inp_tokens; // I32 [n_batch]
|
||||
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
|
||||
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
||||
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
||||
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_context * ctx_mpi = NULL;
|
||||
@@ -4089,22 +4098,24 @@ static struct ggml_tensor * llm_build_inp_embd(
|
||||
const llama_hparams & hparams,
|
||||
const llama_batch & batch,
|
||||
struct ggml_tensor * tok_embd,
|
||||
struct ggml_tensor * inp_tokens,
|
||||
struct ggml_tensor * inp_embd,
|
||||
const llm_build_cb & cb) {
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
if (batch.token) {
|
||||
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
|
||||
struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0);
|
||||
cb(inp_tokens, "inp_tokens", -1);
|
||||
|
||||
inpL = ggml_get_rows(ctx, tok_embd, inp_tokens);
|
||||
inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v);
|
||||
} else {
|
||||
#ifdef GGML_USE_MPI
|
||||
GGML_ASSERT(false && "not implemented");
|
||||
#endif
|
||||
|
||||
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
|
||||
inpL = ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0);
|
||||
}
|
||||
|
||||
return inpL;
|
||||
@@ -4118,6 +4129,7 @@ static void llm_build_k_shift(
|
||||
const llama_cparams & cparams,
|
||||
const llama_kv_cache & kv,
|
||||
struct ggml_cgraph * graph,
|
||||
struct ggml_tensor * K_shift,
|
||||
llm_rope_type type,
|
||||
int64_t n_ctx,
|
||||
float freq_base,
|
||||
@@ -4134,9 +4146,6 @@ static void llm_build_k_shift(
|
||||
const float beta_fast = cparams.yarn_beta_fast;
|
||||
const float beta_slow = cparams.yarn_beta_slow;
|
||||
|
||||
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
|
||||
cb(K_shift, "K_shift", -1);
|
||||
|
||||
int rope_type = 0;
|
||||
|
||||
switch (type) {
|
||||
@@ -4457,6 +4466,7 @@ static struct ggml_tensor * llm_build_kv(
|
||||
|
||||
struct llm_build_context {
|
||||
const llama_model & model;
|
||||
const llama_context & lctx;
|
||||
const llama_hparams & hparams;
|
||||
const llama_cparams & cparams;
|
||||
const llama_batch & batch;
|
||||
@@ -4503,6 +4513,7 @@ struct llm_build_context {
|
||||
const llm_build_cb & cb,
|
||||
bool worst_case) :
|
||||
model (lctx.model),
|
||||
lctx (lctx),
|
||||
hparams (model.hparams),
|
||||
cparams (lctx.cparams),
|
||||
batch (batch),
|
||||
@@ -4563,20 +4574,20 @@ struct llm_build_context {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -4747,20 +4758,20 @@ struct llm_build_context {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -4868,20 +4879,20 @@ struct llm_build_context {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -4990,15 +5001,15 @@ struct llm_build_context {
|
||||
struct ggml_tensor * pos;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
||||
@@ -5087,19 +5098,19 @@ struct llm_build_context {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -5294,11 +5305,11 @@ struct llm_build_context {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -5384,11 +5395,11 @@ struct llm_build_context {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
inpL = llm_build_norm(ctx0, inpL, hparams,
|
||||
@@ -5477,11 +5488,11 @@ struct llm_build_context {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -5573,20 +5584,20 @@ struct llm_build_context {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -5696,20 +5707,20 @@ struct llm_build_context {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -5810,20 +5821,20 @@ struct llm_build_context {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -5931,20 +5942,20 @@ struct llm_build_context {
|
||||
struct ggml_tensor * ffn_output;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -6053,20 +6064,20 @@ struct llm_build_context {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -6160,15 +6171,15 @@ struct llm_build_context {
|
||||
struct ggml_tensor * pos;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
||||
@@ -6258,20 +6269,20 @@ struct llm_build_context {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -6365,15 +6376,7 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
// check if we should build the worst-case graph (for memory measurement)
|
||||
const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
|
||||
|
||||
// keep track of the input that has already been allocated
|
||||
bool alloc_inp_tokens = false;
|
||||
bool alloc_inp_embd = false;
|
||||
bool alloc_inp_pos = false;
|
||||
bool alloc_inp_KQ_mask = false;
|
||||
bool alloc_inp_K_shift = false;
|
||||
|
||||
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
||||
// TODO: improve handling of input and output tensors, then replace this with ggml_set_name
|
||||
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
|
||||
if (il >= 0) {
|
||||
ggml_format_name(cur, "%s-%d", name, il);
|
||||
@@ -6381,127 +6384,79 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
ggml_set_name(cur, name);
|
||||
}
|
||||
|
||||
|
||||
if (!lctx.cparams.offload_kqv) {
|
||||
if (strcmp(name, "kqv_merged_cont") == 0) {
|
||||
// all nodes between the KV store and the attention output are run on the CPU
|
||||
ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// allocate input tensors and set input data
|
||||
//
|
||||
|
||||
if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
|
||||
ggml_tallocr_alloc(lctx.alloc, cur);
|
||||
|
||||
if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
|
||||
const int64_t n_tokens = cur->ne[0];
|
||||
|
||||
ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
|
||||
}
|
||||
|
||||
alloc_inp_tokens = true;
|
||||
}
|
||||
|
||||
if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
|
||||
ggml_tallocr_alloc(lctx.alloc, cur);
|
||||
|
||||
if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) {
|
||||
const int64_t n_embd = cur->ne[0];
|
||||
const int64_t n_tokens = cur->ne[1];
|
||||
|
||||
ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
|
||||
}
|
||||
|
||||
alloc_inp_embd = true;
|
||||
}
|
||||
|
||||
if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
|
||||
ggml_tallocr_alloc(lctx.alloc, cur);
|
||||
|
||||
if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) {
|
||||
const int64_t n_tokens = cur->ne[0];
|
||||
|
||||
static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
|
||||
ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
|
||||
}
|
||||
|
||||
alloc_inp_pos = true;
|
||||
}
|
||||
|
||||
if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
|
||||
ggml_tallocr_alloc(lctx.alloc, cur);
|
||||
|
||||
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
||||
const int64_t n_kv = cur->ne[0];
|
||||
const int64_t n_tokens = cur->ne[1];
|
||||
|
||||
float * data;
|
||||
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
||||
data = (float *) cur->data;
|
||||
} else {
|
||||
lctx.buf_copy.resize(ggml_nbytes(cur));
|
||||
data = (float *) lctx.buf_copy.data();
|
||||
}
|
||||
|
||||
for (int h = 0; h < 1; ++h) {
|
||||
for (int j = 0; j < n_tokens; ++j) {
|
||||
const llama_pos pos = batch.pos[j];
|
||||
const llama_seq_id seq_id = batch.seq_id[j][0];
|
||||
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
float f;
|
||||
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
||||
f = -INFINITY;
|
||||
} else {
|
||||
f = 0;
|
||||
}
|
||||
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (data != cur->data) {
|
||||
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
||||
}
|
||||
}
|
||||
|
||||
alloc_inp_KQ_mask = true;
|
||||
}
|
||||
|
||||
if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
|
||||
ggml_tallocr_alloc(lctx.alloc, cur);
|
||||
|
||||
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
||||
const int64_t n_ctx = cur->ne[0];
|
||||
|
||||
int32_t * data;
|
||||
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
||||
data = (int32_t *) cur->data;
|
||||
} else {
|
||||
lctx.buf_copy.resize(ggml_nbytes(cur));
|
||||
data = (int32_t *) lctx.buf_copy.data();
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_ctx; ++i) {
|
||||
data[i] = lctx.kv_self.cells[i].delta;
|
||||
}
|
||||
|
||||
if (data != cur->data) {
|
||||
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
||||
}
|
||||
}
|
||||
|
||||
alloc_inp_K_shift = true;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_cgraph * result = NULL;
|
||||
|
||||
struct llm_build_context llm(lctx, batch, cb, worst_case);
|
||||
|
||||
//
|
||||
// set input data
|
||||
//
|
||||
|
||||
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
||||
if (batch.token) {
|
||||
const int64_t n_tokens = batch.n_tokens;
|
||||
|
||||
ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
|
||||
}
|
||||
|
||||
if (batch.embd) {
|
||||
const int64_t n_embd = llm.n_embd;
|
||||
const int64_t n_tokens = batch.n_tokens;
|
||||
|
||||
ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
||||
}
|
||||
|
||||
if (batch.pos) {
|
||||
const int64_t n_tokens = batch.n_tokens;
|
||||
|
||||
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
||||
}
|
||||
|
||||
{
|
||||
const int64_t n_kv = llm.n_kv;
|
||||
const int64_t n_tokens = batch.n_tokens;
|
||||
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
||||
float * data = (float *) lctx.inp_KQ_mask->data;
|
||||
|
||||
for (int h = 0; h < 1; ++h) {
|
||||
for (int j = 0; j < n_tokens; ++j) {
|
||||
const llama_pos pos = batch.pos[j];
|
||||
const llama_seq_id seq_id = batch.seq_id[j][0];
|
||||
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
float f;
|
||||
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
||||
f = -INFINITY;
|
||||
} else {
|
||||
f = 0;
|
||||
}
|
||||
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (llm.do_rope_shift) {
|
||||
const int64_t n_ctx = llm.n_ctx;
|
||||
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
||||
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
||||
|
||||
for (int i = 0; i < n_ctx; ++i) {
|
||||
data[i] = lctx.kv_self.cells[i].delta;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
llm.init();
|
||||
|
||||
switch (model.arch) {
|
||||
@@ -8001,10 +7956,57 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
|
||||
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit > b.logit;
|
||||
};
|
||||
if (k == (int) candidates->size) {
|
||||
std::sort(candidates->data, candidates->data + candidates->size, comp);
|
||||
} else {
|
||||
if (k <= 128) {
|
||||
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
|
||||
} else {
|
||||
constexpr int nbuckets = 128;
|
||||
constexpr float bucket_low = -10.0f;
|
||||
constexpr float bucket_high = 10.0f;
|
||||
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
|
||||
constexpr float bucker_inter = -bucket_low * bucket_scale;
|
||||
|
||||
std::vector<int> bucket_idx(candidates->size);
|
||||
std::vector<int> histo(nbuckets, 0);
|
||||
|
||||
for (int i = 0; i < (int)candidates->size; ++i) {
|
||||
const float val = candidates->data[i].logit;
|
||||
int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
|
||||
ib = std::max(0, std::min(nbuckets-1, ib));
|
||||
bucket_idx[i] = ib;
|
||||
++histo[ib];
|
||||
}
|
||||
int nhave = 0;
|
||||
int ib = nbuckets - 1;
|
||||
for ( ; ib >= 0; --ib) {
|
||||
nhave += histo[ib];
|
||||
if (nhave >= k) break;
|
||||
}
|
||||
std::vector<llama_token_data> tmp_tokens(nhave);
|
||||
auto ptr = tmp_tokens.data();
|
||||
std::vector<llama_token_data*> bucket_ptrs;
|
||||
bucket_ptrs.reserve(nbuckets - ib);
|
||||
for (int j = nbuckets - 1; j >= ib; --j) {
|
||||
bucket_ptrs.push_back(ptr);
|
||||
ptr += histo[j];
|
||||
}
|
||||
for (int i = 0; i < (int)candidates->size; ++i) {
|
||||
int j = bucket_idx[i];
|
||||
if (j >= ib) {
|
||||
*bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
|
||||
}
|
||||
}
|
||||
|
||||
ptr = tmp_tokens.data();
|
||||
int ndone = 0;
|
||||
for (int j = nbuckets-1; j > ib; --j) {
|
||||
std::sort(ptr, ptr + histo[j], comp);
|
||||
ptr += histo[j];
|
||||
ndone += histo[j];
|
||||
}
|
||||
std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
|
||||
|
||||
std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
|
||||
|
||||
}
|
||||
candidates->sorted = true;
|
||||
}
|
||||
@@ -8196,6 +8198,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
// no need to do anything if there is only one (or zero) candidates
|
||||
if(candidates_p->size <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Calculate maximum possible entropy
|
||||
float max_entropy = -logf(1.0f / candidates_p->size);
|
||||
|
||||
llama_sample_softmax(nullptr, candidates_p);
|
||||
|
||||
// Calculate entropy of the softmax probabilities
|
||||
float entropy = 0.0f;
|
||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||
float prob = candidates_p->data[i].p;
|
||||
if (prob > 0.0f) { // Ensure no log(0)
|
||||
entropy -= prob * logf(prob);
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above)
|
||||
float normalized_entropy = entropy / max_entropy;
|
||||
|
||||
// Map the normalized entropy to the desired temperature range using the power function
|
||||
float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
|
||||
|
||||
#ifdef DEBUG
|
||||
LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
|
||||
LLAMA_LOG_INFO("Entropy: %f\n", entropy);
|
||||
LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
|
||||
LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
|
||||
LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
|
||||
LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
|
||||
#endif
|
||||
|
||||
// Apply the dynamically calculated temperature scaling
|
||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||
candidates_p->data[i].logit /= dyn_temp;
|
||||
}
|
||||
|
||||
// Re-compute softmax probabilities after scaling logits with dynamic temperature
|
||||
double max_l_double = candidates_p->data[0].logit;
|
||||
double cum_sum_double = 0.0;
|
||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||
double p = exp(candidates_p->data[i].logit - max_l_double);
|
||||
candidates_p->data[i].p = p; // Store the scaled probability
|
||||
cum_sum_double += p;
|
||||
}
|
||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||
candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
// Print the updated top 25 probabilities after temperature scaling
|
||||
LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
|
||||
for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
|
||||
LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ctx) {
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
@@ -8874,6 +8943,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
auto use_more_bits = [](int i_layer, int num_layers) -> bool {
|
||||
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
|
||||
};
|
||||
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
|
||||
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
|
||||
if (n_expert > 1) {
|
||||
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
|
||||
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
||||
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
||||
// tensor name.
|
||||
n_layer /= n_expert;
|
||||
if (sscanf(name, "blk.%d.", &i_layer) != 1) {
|
||||
throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
|
||||
}
|
||||
if (i_layer < 0 || i_layer >= n_layer) {
|
||||
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
|
||||
}
|
||||
}
|
||||
return std::make_pair(i_layer, n_layer);
|
||||
};
|
||||
|
||||
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
||||
int nx = tensor->ne[0];
|
||||
@@ -8935,24 +9021,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
new_type = GGML_TYPE_Q2_K;
|
||||
}
|
||||
} else if (name.find("ffn_down") != std::string::npos) {
|
||||
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
|
||||
int i_layer, n_layer;
|
||||
if (n_expert == 1) {
|
||||
i_layer = qs.i_ffn_down;
|
||||
n_layer = qs.n_ffn_down;
|
||||
} else {
|
||||
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
|
||||
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
||||
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
||||
// tensor name.
|
||||
n_layer = qs.n_ffn_down / n_expert;
|
||||
if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
|
||||
throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
|
||||
}
|
||||
if (i_layer < 0 || i_layer >= n_layer) {
|
||||
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
|
||||
}
|
||||
}
|
||||
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
|
||||
int i_layer = info.first, n_layer = info.second;
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
|
||||
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
||||
@@ -9008,13 +9078,17 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
else if (name.find("ffn_gate") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_gate, qs.n_ffn_gate)) {
|
||||
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
|
||||
int i_layer = info.first, n_layer = info.second;
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
|
||||
new_type = GGML_TYPE_Q2_K;
|
||||
}
|
||||
++qs.i_ffn_gate;
|
||||
}
|
||||
else if (name.find("ffn_up") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_up, qs.n_ffn_up)) {
|
||||
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
|
||||
int i_layer = info.first, n_layer = info.second;
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
|
||||
new_type = GGML_TYPE_Q2_K;
|
||||
}
|
||||
++qs.i_ffn_up;
|
||||
@@ -9964,6 +10038,35 @@ struct llama_context * llama_new_context_with_model(
|
||||
ctx->embedding.resize(hparams.n_embd);
|
||||
}
|
||||
|
||||
// graph inputs
|
||||
{
|
||||
ggml_init_params init_params = {
|
||||
/* .mem_size */ ggml_tensor_overhead()*5,
|
||||
/* .mem_buffer */ nullptr,
|
||||
/* .no_alloc */ true,
|
||||
};
|
||||
ctx->ctx_input = ggml_init(init_params);
|
||||
|
||||
ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
||||
ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
|
||||
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
||||
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
|
||||
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
|
||||
|
||||
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
||||
ggml_set_name(ctx->inp_embd, "inp_embd");
|
||||
ggml_set_name(ctx->inp_pos, "inp_pos");
|
||||
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
|
||||
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
|
||||
|
||||
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
||||
|
||||
LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buffer_name(ctx->buf_input),
|
||||
ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
// scheduler and compute buffers
|
||||
{
|
||||
// buffer types used for the compute buffer of each backend
|
||||
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
||||
@@ -9990,9 +10093,6 @@ struct llama_context * llama_new_context_with_model(
|
||||
|
||||
// initialize scheduler with the worst-case graph
|
||||
ggml_backend_sched_init_measure(ctx->sched, gf);
|
||||
// note: the number of splits during measure is higher than during inference due to the kv shift
|
||||
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
|
||||
LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
|
||||
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
|
||||
|
||||
for (ggml_backend_t backend : ctx->backends) {
|
||||
@@ -10001,6 +10101,10 @@ struct llama_context * llama_new_context_with_model(
|
||||
ggml_backend_buffer_name(buf),
|
||||
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
// note: the number of splits during measure is higher than during inference due to the kv shift
|
||||
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
|
||||
LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -775,6 +775,14 @@ extern "C" {
|
||||
float p,
|
||||
size_t min_keep);
|
||||
|
||||
/// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
|
||||
LLAMA_API void llama_sample_entropy(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates_p,
|
||||
float min_temp,
|
||||
float max_temp,
|
||||
float exponent_val);
|
||||
|
||||
LLAMA_API void llama_sample_temp(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
|
||||
Reference in New Issue
Block a user