Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New tokenizer-verifier tool to check GGUF tokenizer parameters #6988

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ else()
endif()
add_subdirectory(main)
add_subdirectory(tokenize)
add_subdirectory(tokenizer-verifier)
add_subdirectory(parallel)
add_subdirectory(perplexity)
add_subdirectory(quantize)
Expand Down
5 changes: 5 additions & 0 deletions examples/tokenizer-verifier/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Build script for the tokenizer-verifier example executable.
set(TARGET tokenizer-verifier)

# Single-source executable.
add_executable(${TARGET}
    tokenizer-verifier.cpp
)

# The tool is compiled as (at least) C++11.
target_compile_features(${TARGET} PRIVATE cxx_std_11)

# Link against the project's common and llama targets plus the platform
# thread library.
target_link_libraries(${TARGET}
    PRIVATE
        common
        llama
        ${CMAKE_THREAD_LIBS_INIT}
)

# Install the resulting binary alongside the other runtime artifacts.
install(TARGETS ${TARGET} RUNTIME)
78 changes: 78 additions & 0 deletions examples/tokenizer-verifier/tokenizer-verifier.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

// Encodes a single Unicode code point as UTF-8.
//
// Writes between 1 and 4 bytes to `dest` and does NOT append a terminating
// NUL, so the caller must provide at least 4 writable bytes (and pre-zero a
// 5-byte buffer if the result is to be used as a C string).
//
// The reserved UTF-16 surrogate range 0xD800 - 0xDFFF is deliberately encoded
// like any other 3-byte value for simplicity, even though such sequences are
// not well-formed UTF-8.
//
// Returns 0 on success, or 1 when `codepoint` lies outside [0, 0x10FFFF]; in
// that case nothing is written to `dest`.
static int unicode_to_utf8(int codepoint, char *dest) {
    // https://stackoverflow.com/a/4609989 — who needs iconv?

    // Reject out-of-range input up front. Negative values used to fall into
    // the single-byte branch and were silently truncated to one byte.
    if (codepoint < 0 || codepoint >= 0x110000) {
        return 1;
    }

    if (codepoint < 0x80) {
        // 1 byte: 0xxxxxxx
        *dest++ = static_cast<char>(codepoint);
    } else if (codepoint < 0x800) {
        // 2 bytes: 110xxxxx 10xxxxxx
        *dest++ = static_cast<char>(0xC0 | (codepoint >> 6));
        *dest++ = static_cast<char>(0x80 | (codepoint & 0x3F));
    } else if (codepoint < 0x10000) {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx (surrogates included, see above)
        *dest++ = static_cast<char>(0xE0 | (codepoint >> 12));
        *dest++ = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        *dest++ = static_cast<char>(0x80 | (codepoint & 0x3F));
    } else {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        *dest++ = static_cast<char>(0xF0 | (codepoint >> 18));
        *dest++ = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        *dest++ = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        *dest++ = static_cast<char>(0x80 | (codepoint & 0x3F));
    }
    return 0;
}

// Entry point: loads only the vocabulary of the model given on the command
// line, then feeds every 7-bit ASCII character and every potential Unicode
// code point (0 .. 0x10FFFF) through the tokenizer one at a time, counting the
// inputs for which tokenization throws.
//
// Exit status: 0 when nothing failed; 1 on a usage error, when the model
// could not be loaded, or when at least one input failed to tokenize.
int main(int argc, char **argv) {
    if (argc < 2) {
        printf("usage: %s MODEL_PATH\n", argv[0]);
        return 1;
    }

    const char *model_path = argv[1];

    llama_backend_init();

    // Only the tokenizer is exercised, so skip loading the weights.
    llama_model_params model_params = llama_model_default_params();
    model_params.vocab_only = true;
    llama_model *model = llama_load_model_from_file(model_path, model_params);
    if (model == nullptr) {
        // Without this guard a bad path hands a null model to the tokenizer,
        // which would most likely crash rather than throw, so the
        // catch (...) blocks below could not report it.
        fprintf(stderr, "error: failed to load model '%s'\n", model_path);
        return 1;
    }
    // NOTE(review): the model and the backend are never released explicitly
    // before returning; consider adding the matching cleanup calls.

    std::vector<llama_token> tokens;

    int failed_ascii = 0;
    const int ascii_max = 127;
    for (int c = 0; c <= ascii_max; c++) {
        // NOTE(review): for c == 0 the prompt is an empty C string, so NUL
        // itself is presumably not exercised — confirm this is intended.
        const char prompt[] = {(char)c, '\0'};
        try {
            tokens = ::llama_tokenize(model, prompt, false, true);
        } catch (...) {
            printf("%#x -> Tokenization failed for char '%c'\n", c, (char)c);
            failed_ascii += 1;
            continue;
        }
    }
    // The loop covers 0..ascii_max inclusive, i.e. ascii_max + 1 characters.
    printf("%d/%d 7-bit ascii characters could not be tokenized\n", failed_ascii, ascii_max + 1);

    int failed_unicode = 0;
    const int utf8_max = 0x10FFFF;
    // Now let's do all potential codepoints
    for (int cp = 0; cp <= utf8_max; cp++) {
        // Zero-initialized so the encoded bytes are always NUL-terminated.
        char buf[5] = {};
        if (unicode_to_utf8(cp, buf)) {
            printf("Impossible to encode codepoint %#x\n", cp);
            continue;
        }
        try {
            tokens = ::llama_tokenize(model, buf, false, true);
        } catch (...) {
            // printf("%#x -> Tokenization failed for codepoint '%s'\n", cp, buf);
            failed_unicode += 1;
            continue;
        }
    }
    // The loop covers 0..utf8_max inclusive, i.e. utf8_max + 1 code points.
    printf("%d/%d potential unicode codepoints not tokenized\n", failed_unicode,
           utf8_max + 1);

    return (failed_ascii != 0 || failed_unicode != 0);
}