Skip to content

Commit

Permalink
tests : add test-tokenizer-0.sh + fix some tokenizers (ggerganov#7036)
Browse files (browse the repository at this point in the history)
* tests : add test-tokenizer-0.sh

* unicode : add all unicode number ranges

* starcoder : fix pre-tokenizer

* tests : add test that fails with DeepSeek tokenizers

* falcon : fix regex

* unicode : regenerate unicode tables

* refact : add tokenizer model

* lint : fix

* tests : disable failing tests

ggml-ci

* refact : add test files

ggml-ci

* convert : print -> logging

ggml-ci

* lint : fix

* unicode : digit -> number

* phi-3 : update
  • Loading branch information
ggerganov authored and teleprint-me committed May 7, 2024
1 parent 2782855 commit b619d2b
Show file tree
Hide file tree
Showing 41 changed files with 922 additions and 728 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[flake8]
max-line-length = 125
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
exclude = examples/*,examples/*/**,*/**/__init__.py
exclude = examples/*,examples/*/**,*/**/__init__.py,scripts/gen-unicode-data.py,tests/test-tokenizer-0.py
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
Expand Down
48 changes: 25 additions & 23 deletions convert-hf-to-gguf-update.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import requests
from transformers import AutoTokenizer

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update")


Expand Down Expand Up @@ -69,8 +70,8 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "mistral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
{"name": "mixtral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
{"name": "mixtral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
]

# make directory "models/tokenizers" if it doesn't exist
Expand Down Expand Up @@ -167,8 +168,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(f"chktok: {{chktok}}")
print(f"chkhsh: {{chkhsh}}")
logger.debug(f"chktok: {{chktok}}")
logger.debug(f"chkhsh: {{chkhsh}}")
res = None
Expand All @@ -177,22 +178,22 @@ def get_vocab_base_pre(self, tokenizer) -> str:
# don't edit the hashes manually!
{src_ifs}
if res is None:
print("\\n")
print("**************************************************************************************")
print("** WARNING: The BPE pre-tokenizer was not recognized!")
print("** There are 2 possible reasons for this:")
print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
print("** - the pre-tokenization config has changed upstream")
print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
print("**")
print(f"** chkhsh: {{chkhsh}}")
print("**************************************************************************************")
print("\\n")
logger.warning("\\n")
logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**")
logger.warning(f"** chkhsh: {{chkhsh}}")
logger.warning("**************************************************************************************")
logger.warning("\\n")
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
print(f"tokenizer.ggml.pre: {{repr(res)}}")
print(f"chkhsh: {{chkhsh}}")
logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
logger.debug(f"chkhsh: {{chkhsh}}")
return res
"""
Expand All @@ -206,6 +207,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
# generate tests for each tokenizer model

tests = [
"ied 4 ½ months",
"Führer",
"",
" ",
" ",
Expand Down Expand Up @@ -285,17 +288,16 @@ def get_vocab_base_pre(self, tokenizer) -> str:

# generate commands for creating vocab files
shscript = "#!/usr/bin/env bash\n\n"
logging.info("\nRun the following commands to generate the vocab files for testing:\n")
with open("generate-vocab.sh", "w", encoding="utf-8") as f:
f.writelines(shscript)
logging.info(f"Wrote {len(shscript)} bytes to generate-vocab.sh")

for model in models:
name = model["name"]
tmpline = f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only\n"
shscript += tmpline
logging.info(tmpline)
logging.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
logging.info(tmpline.strip())

with open("generate-vocab.sh", "w", encoding="utf-8") as f:
f.writelines(shscript)
logging.info(f"Wrote {len(shscript)} bytes to generate-vocab.sh")

logging.info("Run the following commands to generate the vocab files for testing:")
logging.info("Enable execution: chmod +x generate-vocab.sh")
Expand Down
23 changes: 19 additions & 4 deletions convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,27 @@

from __future__ import annotations

import logging
import argparse
import contextlib
import json
import logging
import os
import re
import sys
from abc import ABC, abstractmethod
from enum import IntEnum
from pathlib import Path
from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Callable,
ContextManager,
Iterator,
Sequence,
TypeVar,
cast,
)

import numpy as np
import torch
Expand Down Expand Up @@ -323,6 +332,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "e750a9b14dfed9b73287639bd1ecda50c38fa6011138f2f609804c6dab9ed5c2":
# ref: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
res = "mixtral-bpe"
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
res = "command-r"

if res is None:
logger.warning("\n")
Expand All @@ -339,7 +354,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
logger.warning("\n")
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

logger.debug(f"tokenizer.ggml.pre: {res}")
logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
logger.debug(f"chkhsh: {chkhsh}")

return res
Expand Down
21 changes: 15 additions & 6 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4383,6 +4383,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "gpt-2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "refact") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
Expand Down Expand Up @@ -11952,7 +11955,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id);
const auto & token_data = vocab.id_to_token.at(id);
switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: {
auto buf = token_data.text.substr(3, 2);
Expand Down Expand Up @@ -12212,14 +12215,13 @@ struct llm_tokenizer_bpe {
"\\s?\\p{L}+",
"\\s?\\p{P}+",
"[一-龥ࠀ-一가-퟿]+",
"\\p{N}+",
"\\p{N}",
});
break;
case LLAMA_VOCAB_PRE_TYPE_FALCON:
word_collection = unicode_regex_split(text, {
"[\\p{P}\\$\\+<=>\\^~\\|]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"\\p{N}+",
"[0-9][0-9][0-9]",
});
break;
Expand All @@ -12235,6 +12237,12 @@ struct llm_tokenizer_bpe {
});
break;
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
word_collection = unicode_regex_split(text, {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
break;
case LLAMA_VOCAB_PRE_TYPE_GPT2:
word_collection = unicode_regex_split(text, {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
Expand Down Expand Up @@ -17466,9 +17474,10 @@ int32_t llama_tokenize(

static std::string llama_decode_text(const std::string & text) {
std::string decoded_text;
auto unicode_sequences = unicode_cpts_from_utf8(text);
for (auto & unicode_sequence : unicode_sequences) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));

const auto cpts = unicode_cpts_from_utf8(text);
for (const auto cpt : cpts) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
}

return decoded_text;
Expand Down
1 change: 1 addition & 0 deletions llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
};

// note: these values should be synchronized with ggml_rope
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-bert-bge.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-bert-bge.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
29464 2094 1018 1092 2706
11865 17875



Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-deepseek-coder.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-deepseek-coder.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
1050 207 19 207 19192 4217
37 32009 71 6247

207
243
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-deepseek-llm.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-deepseek-llm.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
1052 207 19 207 19109 4223
37 100014 71 6245

207
243
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-falcon.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-falcon.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
878 204 31 3068 133 2137
28611 132 30042

204
258
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-gpt-2.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-gpt-2.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
798 604 25208 1933
37 9116 71 11751

220
220 220
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-llama-bpe.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-llama-bpe.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
1142 220 19 220 27154 4038
37 51853 261

220
256
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-llama-spm.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-llama-spm.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
474 287 29871 29946 29871 30226 7378
383 4000 261

259
1678
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-mpt.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-mpt.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
728 577 24142 2607
39 26288 6554

209
50276
Expand Down
Binary file modified models/ggml-vocab-phi-3.gguf
Binary file not shown.
4 changes: 4 additions & 0 deletions models/ggml-vocab-phi-3.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-phi-3.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
474 287 29871 29946 29871 30226 7378
383 4000 261

259
1678
Expand Down
Binary file modified models/ggml-vocab-refact.gguf
Binary file not shown.

0 comments on commit b619d2b

Please sign in to comment.