Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tests : add test-tokenizer-0.sh #7036

Merged
merged 15 commits into from
May 4, 2024
Merged
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[flake8]
max-line-length = 125
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
exclude = examples/*,examples/*/**,*/**/__init__.py
exclude = examples/*,examples/*/**,*/**/__init__.py,scripts/gen-unicode-data.py,tests/test-tokenizer-0.py
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
Expand Down
38 changes: 21 additions & 17 deletions convert-hf-to-gguf-update.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from enum import IntEnum, auto
from transformers import AutoTokenizer

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update")


Expand Down Expand Up @@ -62,6 +63,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
]

# make directory "models/tokenizers" if it doesn't exist
Expand Down Expand Up @@ -158,8 +160,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()

print(f"chktok: {{chktok}}")
print(f"chkhsh: {{chkhsh}}")
logger.debug(f"chktok: {{chktok}}")
logger.debug(f"chkhsh: {{chkhsh}}")

res = None

Expand All @@ -168,22 +170,22 @@ def get_vocab_base_pre(self, tokenizer) -> str:
# don't edit the hashes manually!
{src_ifs}
if res is None:
print("\\n")
print("**************************************************************************************")
print("** WARNING: The BPE pre-tokenizer was not recognized!")
print("** There are 2 possible reasons for this:")
print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
print("** - the pre-tokenization config has changed upstream")
print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
print("**")
print(f"** chkhsh: {{chkhsh}}")
print("**************************************************************************************")
print("\\n")
logger.warning("\\n")
logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**")
logger.warning(f"** chkhsh: {{chkhsh}}")
logger.warning("**************************************************************************************")
logger.warning("\\n")
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

print(f"tokenizer.ggml.pre: {{repr(res)}}")
print(f"chkhsh: {{chkhsh}}")
logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
logger.debug(f"chkhsh: {{chkhsh}}")

return res
"""
Expand All @@ -197,6 +199,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
# generate tests for each tokenizer model

tests = [
"ied 4 ½ months",
"Führer",
"",
" ",
" ",
Expand Down Expand Up @@ -281,6 +285,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
for model in models:
name = model["name"]

logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100

logger.info("\n")
5 changes: 4 additions & 1 deletion convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
# ref: https://huggingface.co/openai-community/gpt2
res = "gpt-2"
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"

if res is None:
logger.warning("\n")
Expand All @@ -324,7 +327,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
logger.warning("\n")
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

logger.debug(f"tokenizer.ggml.pre: {res}")
logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
logger.debug(f"chkhsh: {chkhsh}")

return res
Expand Down
21 changes: 15 additions & 6 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4383,6 +4383,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "gpt-2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "refact") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
Expand Down Expand Up @@ -11952,7 +11955,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id);
const auto & token_data = vocab.id_to_token.at(id);
switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: {
auto buf = token_data.text.substr(3, 2);
Expand Down Expand Up @@ -12212,14 +12215,13 @@ struct llm_tokenizer_bpe {
"\\s?\\p{L}+",
"\\s?\\p{P}+",
"[一-龥ࠀ-一가-퟿]+",
"\\p{N}+",
"\\p{N}",
});
break;
case LLAMA_VOCAB_PRE_TYPE_FALCON:
word_collection = unicode_regex_split(text, {
"[\\p{P}\\$\\+<=>\\^~\\|]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"\\p{N}+",
"[0-9][0-9][0-9]",
});
break;
Expand All @@ -12235,6 +12237,12 @@ struct llm_tokenizer_bpe {
});
break;
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
word_collection = unicode_regex_split(text, {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
break;
case LLAMA_VOCAB_PRE_TYPE_GPT2:
word_collection = unicode_regex_split(text, {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
Expand Down Expand Up @@ -17466,9 +17474,10 @@ int32_t llama_tokenize(

static std::string llama_decode_text(const std::string & text) {
std::string decoded_text;
auto unicode_sequences = unicode_cpts_from_utf8(text);
for (auto & unicode_sequence : unicode_sequences) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));

const auto cpts = unicode_cpts_from_utf8(text);
for (const auto cpt : cpts) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
}

return decoded_text;
Expand Down
1 change: 1 addition & 0 deletions llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
};

// note: these values should be synchronized with ggml_rope
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-bert-bge.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-bert-bge.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
29464 2094 1018 1092 2706
11865 17875



Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-deepseek-coder.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-deepseek-coder.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
1050 207 19 207 19192 4217
37 32009 71 6247

207
243
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-deepseek-llm.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-deepseek-llm.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
1052 207 19 207 19109 4223
37 100014 71 6245

207
243
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-falcon.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-falcon.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
878 204 31 3068 133 2137
28611 132 30042

204
258
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-gpt-2.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-gpt-2.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
798 604 25208 1933
37 9116 71 11751

220
220 220
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-llama-bpe.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-llama-bpe.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
1142 220 19 220 27154 4038
37 51853 261

220
256
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-llama-spm.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-llama-spm.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
474 287 29871 29946 29871 30226 7378
383 4000 261

259
1678
Expand Down
4 changes: 4 additions & 0 deletions models/ggml-vocab-mpt.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-mpt.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
728 577 24142 2607
39 26288 6554

209
50276
Expand Down
Binary file modified models/ggml-vocab-phi-3.gguf
Binary file not shown.
4 changes: 4 additions & 0 deletions models/ggml-vocab-phi-3.gguf.inp
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__

__ggml_vocab_test__

Expand Down
2 changes: 2 additions & 0 deletions models/ggml-vocab-phi-3.gguf.out
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
474 287 29871 29946 29871 30226 7378
383 4000 261

259
1678
Expand Down
Binary file modified models/ggml-vocab-refact.gguf
Binary file not shown.