Hey! I am attempting to use the new C API from C++, but I must be doing something significantly wrong, since the results are very weird (literally nonsense). Here is the code:

```cpp
#include <llama.h>

#include <cstdio>
#include <ctime>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <string>
#include <string_view>
#include <thread>
#include <vector>

class LLM {
    struct Exception : public std::runtime_error {
        using std::runtime_error::runtime_error;
    };

    struct {
        std::string model = "7B-ggml-model-quant.bin";
        int32_t seed; // RNG seed
        int32_t n_threads = static_cast<int32_t>(std::thread::hardware_concurrency()) / 4;
        int32_t n_ctx = 2024; // Context size
        int32_t n_batch = 8; // Batch size
        int32_t top_k = 40;
        float top_p = 0.5f;
        float temp = 0.81f;
    } params;

    struct State {
        std::string prompt;
        std::vector<llama_token> embd;
        int n_ctx;
    } state;

    llama_context *ctx;
    std::mutex lock;

    void init() {
        // Get llama parameters
        puts("30");
        auto lparams = llama_context_default_params();
        lparams.seed = params.seed;
        lparams.n_ctx = params.n_ctx;
        // Create context
        puts("31");
        ctx = llama_init_from_file(params.model.c_str(), lparams);
        puts("32");
        // Initialize some variables
        state.n_ctx = llama_n_ctx(ctx);
    }

public:
    LLM(int32_t seed = 0) {
        // Set random seed
        params.seed = seed ? seed : time(nullptr);
        // Initialize llama
        init();
    }

    void append(const std::string& prompt) {
        std::scoped_lock L(lock);
        // Check if prompt was empty
        const bool was_empty = state.prompt.empty();
        // Append to current prompt
        printf("ddd %s\n", prompt.c_str());
        state.prompt.append(prompt);
        // Resize buffer for tokens
        puts("cccc");
        const auto old_token_count = state.embd.size();
        state.embd.resize(old_token_count + state.prompt.size() + 1);
        // Run tokenizer
        puts("bbbb");
        const auto token_count = llama_tokenize(ctx, prompt.data(), state.embd.data() + old_token_count,
                                                static_cast<int>(state.embd.size() - old_token_count), was_empty);
        state.embd.resize(old_token_count + token_count);
        // Evaluate new tokens
        // TODO: Larger batch size
        printf("aaa %zu+%d=%zu\n", old_token_count, token_count, old_token_count + token_count);
        for (auto it = old_token_count; it != old_token_count + token_count; it++) {
            printf("aaa %zu %s\n", it, llama_token_to_str(ctx, state.embd.data()[it]));
            llama_eval(ctx, state.embd.data() + it, 1, static_cast<int>(it), params.n_threads);
        }
    }

    std::string run(std::string_view end) {
        std::scoped_lock L(lock);
        std::string fres;
        // Loop until done
        puts("6");
        bool abort = false;
        while (!abort && !fres.ends_with(end)) {
            // Sample top p and top k
            const auto id = llama_sample_top_p_top_k(ctx, nullptr, 0, params.top_k, params.top_p, params.temp, 1.0f);
            // Add token
            state.embd.push_back(id);
            // Get token as string
            const auto str = llama_token_to_str(ctx, id);
            // Debug
            std::cout << str << std::flush;
            // Append string to function result
            fres.append(str);
            // Evaluate token
            // TODO: Larger batch size
            llama_eval(ctx, state.embd.data() + state.embd.size() - 1, 1,
                       static_cast<int>(state.embd.size() - 1), params.n_threads);
        }
        // Return final string
        puts("23");
        state.prompt.append(fres);
        return std::string(fres.data(), fres.size() - end.size());
    }
};
```

It'd be amazing if someone with a bit more knowledge than me could look over this and maybe give me some tips and hints :-)

Btw: the code is called from multiple threads, but never at the same time thanks to the locks. Thanks!
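In case it matters, the call pattern from the other threads is roughly like this (a simplified sketch, not my exact calling code):

```cpp
// Hypothetical driver: append() and run() both take the internal mutex,
// so calls from different threads are serialized and never overlap.
LLM llm;

std::thread worker([&llm] {
    llm.append("The quick brown fox");
    const std::string out = llm.run("\n"); // generate until a newline appears
    std::cout << out << std::endl;
});
worker.join();
```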
Answered by niansa, Mar 23, 2023
Never mind, my code is correct. I had a space at the end of the prompt.
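In case anyone else runs into this: trimming trailing whitespace from the prompt before tokenizing avoids the problem. A minimal sketch (the `rtrim` helper is made up, not part of the llama.cpp API):

```cpp
#include <cctype>
#include <string>

// Hypothetical helper: strip trailing whitespace so the tokenizer never
// sees a dangling " " at the end of the prompt.
static std::string rtrim(std::string s) {
    while (!s.empty() && std::isspace(static_cast<unsigned char>(s.back())))
        s.pop_back();
    return s;
}

// Usage: llm.append(rtrim(user_input));
```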
Answer selected by niansa