convert : add BPE pre-tokenization for DBRX (#7132)

* Add BPE pre-tokenization for DBRX.
* Add vocab GGUFs.
* Remove test.
* Remove GGUFs.
py : also print the normalizers
2026-06-16 18:47:39 +02:00 · 2024-05-08 13:43:23 +03:00 · 2024-05-08 12:47:07 +03:00 · 2024-05-08 10:54:39 +02:00
5 changed files with 20 additions and 6 deletions
@@ -68,6 +68,7 @@ models = [
    {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
    {"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
    {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
+    {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
 ]

 # make directory "models/tokenizers" if it doesn't exist
@@ -151,6 +152,8 @@ for model in models:
    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
+        normalizer = cfg["normalizer"]
+        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
        pre_tokenizer = cfg["pre_tokenizer"]
        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

@@ -317,6 +317,9 @@ class Model(ABC):
        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
            # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
            res = "olmo"
+        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
+            # ref: https://huggingface.co/databricks/dbrx-instruct
+            res = "dbrx"

        if res is None:
            logger.warning("\n")
@@ -4394,6 +4394,9 @@ static void llm_load_vocab(
            } else if (
                tokenizer_pre == "olmo") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
+            } else if (
+                tokenizer_pre == "dbrx") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
@@ -12200,6 +12203,7 @@ struct llm_tokenizer_bpe {
            case LLAMA_VOCAB_TYPE_BPE:
                switch (vocab.type_pre) {
                    case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
                        word_collection = unicode_regex_split(text, {
                            // original regex from tokenizer.json
                            //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -82,6 +82,7 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
        LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
        LLAMA_VOCAB_PRE_TYPE_OLMO           = 10,
+        LLAMA_VOCAB_PRE_TYPE_DBRX           = 11,
    };

    // note: these values should be synchronized with ggml_rope
@@ -93,11 +93,14 @@ help_s = (
    "specified values are averaged WITHOUT weighing by the --repetitions parameter of llama-bench."
 )
 parser.add_argument("-s", "--show", help=help_s)
+parser.add_argument("--verbose", action="store_true", help="increase output verbosity")

 known_args, unknown_args = parser.parse_known_args()

+logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
+
 if unknown_args:
-    logger.error(f"Received unknown args: {unknown_args}.")
+    logger.error(f"Received unknown args: {unknown_args}.\n")
    parser.print_help()
    sys.exit(1)

@@ -110,7 +113,7 @@ if input_file is None:
        input_file = sqlite_files[0]

 if input_file is None:
-    logger.error("Cannot find a suitable input file, please provide one.")
+    logger.error("Cannot find a suitable input file, please provide one.\n")
    parser.print_help()
    sys.exit(1)

@@ -202,12 +205,12 @@ elif repo is not None:
    hexsha8_baseline = find_parent_in_data(repo.heads.master.commit)

    if hexsha8_baseline is None:
-        logger.error("No baseline was provided and did not find data for any master branch commits.")
+        logger.error("No baseline was provided and did not find data for any master branch commits.\n")
        parser.print_help()
        sys.exit(1)
 else:
    logger.error("No baseline was provided and the current working directory "
-                 "is not part of a git repository from which a baseline could be inferred.")
+                 "is not part of a git repository from which a baseline could be inferred.\n")
    parser.print_help()
    sys.exit(1)

@@ -238,7 +241,7 @@ elif repo is not None:
            break

    if hexsha8_compare is None:
-        logger.error("No compare target was provided and did not find data for any non-master commits.")
+        logger.error("No compare target was provided and did not find data for any non-master commits.\n")
        parser.print_help()
        sys.exit(1)
 else:
@@ -361,7 +364,7 @@ if "gpu_info" in show:
 headers  = [PRETTY_NAMES[p] for p in show]
 headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]

-logger.info(tabulate(
+print(tabulate( # noqa: NP100
    table,
    headers=headers,
    floatfmt=".2f",
Author	SHA1	Message	Date
DAN™	4cd621c26d	convert : add BPE pre-tokenization for DBRX (#7132) * Add BPE pre-tokenization for DBRX. * Add vocab GGUFs. * Remove test. * Remove GGUFs.	2024-05-08 13:43:23 +03:00
Georgi Gerganov	7e0b6a7b3b	py : also print the normalizers	2024-05-08 12:47:07 +03:00
Brian	acdce3cdef	compare-llama-bench.py: add missing basicConfig (#7138) * compare-llama-bench.py: add missing basicConfig * compare-llama-bench.py: Add line break between error message and print_help() * Add regular print() markdown table	2024-05-08 10:54:39 +02:00