Skip to content

Commit

Permalink
tests : add test-tokenizer-0.sh + fix some tokenizers (#7036)
Browse files (browse the repository at this point in the history)
* tests : add test-tokenizer-0.sh

* unicode : add all unicode number ranges

* starcoder : fix pre-tokenizer

* tests : add test that fails with DeepSeek tokenizers

* falcon : fix regex

* unicode : regenerate unicode tables

* refact : add tokenizer model

* lint : fix

* tests : disable failing tests

ggml-ci

* refact : add tests files

ggml-ci

* convert : print -> logging

ggml-ci

* lint : fix

* unicode : digit -> number

* phi-3 : update
  • Loading branch information
ggerganov committed May 4, 2024
1 parent a2ac89d commit 92139b9
Show file tree
Hide file tree
Showing 41 changed files with 903 additions and 719 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[flake8]
max-line-length = 125
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
exclude = examples/*,examples/*/**,*/**/__init__.py
exclude = examples/*,examples/*/**,*/**/__init__.py,scripts/gen-unicode-data.py,tests/test-tokenizer-0.py
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
Expand Down
38 changes: 21 additions & 17 deletions convert-hf-to-gguf-update.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from enum import IntEnum, auto
from transformers import AutoTokenizer

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update")


Expand Down Expand Up @@ -62,6 +63,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
]

# make directory "models/tokenizers" if it doesn't exist
Expand Down Expand Up @@ -158,8 +160,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(f"chktok: {{chktok}}")
print(f"chkhsh: {{chkhsh}}")
logger.debug(f"chktok: {{chktok}}")
logger.debug(f"chkhsh: {{chkhsh}}")
res = None
Expand All @@ -168,22 +170,22 @@ def get_vocab_base_pre(self, tokenizer) -> str:
# don't edit the hashes manually!
{src_ifs}
if res is None:
print("\\n")
print("**************************************************************************************")
print("** WARNING: The BPE pre-tokenizer was not recognized!")
print("** There are 2 possible reasons for this:")
print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
print("** - the pre-tokenization config has changed upstream")
print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
print("**")
print(f"** chkhsh: {{chkhsh}}")
print("**************************************************************************************")
print("\\n")
logger.warning("\\n")
logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**")
logger.warning(f"** chkhsh: {{chkhsh}}")
logger.warning("**************************************************************************************")
logger.warning("\\n")
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
print(f"tokenizer.ggml.pre: {{repr(res)}}")
print(f"chkhsh: {{chkhsh}}")
logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
logger.debug(f"chkhsh: {{chkhsh}}")
return res
"""
Expand All @@ -197,6 +199,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
# generate tests for each tokenizer model

tests = [
"ied 4 ½ months",
"Führer",
"",
" ",
" ",
Expand Down Expand Up @@ -281,6 +285,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
for model in models:
name = model["name"]

logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100

logger.info("\n")
5 changes: 4 additions & 1 deletion convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
# ref: https://huggingface.co/openai-community/gpt2
res = "gpt-2"
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"

if res is None:
logger.warning("\n")
Expand All @@ -324,7 +327,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
logger.warning("\n")
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

logger.debug(f"tokenizer.ggml.pre: {res}")
logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
logger.debug(f"chkhsh: {chkhsh}")

return res
Expand Down
21 changes: 15 additions & 6 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4383,6 +4383,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "gpt-2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "refact") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
Expand Down Expand Up @@ -11952,7 +11955,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id);
const auto & token_data = vocab.id_to_token.at(id);
switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: {
auto buf = token_data.text.substr(3, 2);
Expand Down Expand Up @@ -12212,14 +12215,13 @@ struct llm_tokenizer_bpe {
"\\s?\\p{L}+",
"\\s?\\p{P}+",
"[一-龥ࠀ-一가-퟿]+",
"\\p{N}+",
"\\p{N}",
});
break;
case LLAMA_VOCAB_PRE_TYPE_FALCON:
word_collection = unicode_regex_split(text, {
"[\\p{P}\\$\\+<=>\\^~\\|]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"\\p{N}+",
"[0-9][0-9][0-9]",
});
break;
Expand All @@ -12235,6 +12237,12 @@ struct llm_tokenizer_bpe {
});
break;
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
word_collection = unicode_regex_split(text, {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
break;
case LLAMA_VOCAB_PRE_TYPE_GPT2:
word_collection = unicode_regex_split(text, {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
Expand Down Expand Up @@ -17466,9 +17474,10 @@ int32_t llama_tokenize(

static std::string llama_decode_text(const std::string & text) {
std::string decoded_text;
auto unicode_sequences = unicode_cpts_from_utf8(text);
for (auto & unicode_sequence : unicode_sequences) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));

const auto cpts = unicode_cpts_from_utf8(text);
for (const auto cpt : cpts) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
}

return decoded_text;
Expand Down
1 change: 1 addition & 0 deletions llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
};

// note: these values should be synchronized with ggml_rope
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-bert-bge.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-bert-bge.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
29464 2094 1018 1092 2706
11865 17875



Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-deepseek-coder.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-deepseek-coder.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
1050 207 19 207 19192 4217
37 32009 71 6247

207
243
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-deepseek-llm.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-deepseek-llm.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
1052 207 19 207 19109 4223
37 100014 71 6245

207
243
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-falcon.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-falcon.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
878 204 31 3068 133 2137
28611 132 30042

204
258
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-gpt-2.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-gpt-2.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
798 604 25208 1933
37 9116 71 11751

220
220 220
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-llama-bpe.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-llama-bpe.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
1142 220 19 220 27154 4038
37 51853 261

220
256
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-llama-spm.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-llama-spm.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
474 287 29871 29946 29871 30226 7378
383 4000 261

259
1678
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-mpt.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-mpt.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
728 577 24142 2607
39 26288 6554

209
50276
Expand Down
Binary file modified models/ggml-vocab-phi-3.gguf
Binary file not shown.
4 changes: 4 additions & 0 deletions models/ggml-vocab-phi-3.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-phi-3.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
474 287 29871 29946 29871 30226 7378
383 4000 261

259
1678
Expand Down
Binary file modified models/ggml-vocab-refact.gguf
Binary file not shown.

0 comments on commit 92139b9

Please sign in to comment.