Hey, I have written this C++ class:
#include <cstdint>
#include <string>
#include <string_view>
#include <vector>
#include <mutex>
#include <thread>
#include <iostream>
#include <stdexcept>
#include <ctime>

#include "llama.h"

class LLM {
    struct {
        std::string model = "7B-ggml-model-quant.bin";
        int32_t seed; // RNG seed
        int32_t n_threads = static_cast<int32_t>(std::thread::hardware_concurrency()) / 4;
        int32_t n_ctx = 2024; // Context size
        int32_t n_batch = 8; // Batch size, unused for now
        int32_t top_k = 40;
        float top_p = 0.5f;
        float temp = 0.72f;
    } params;

    struct State {
        std::string prompt;
        std::vector<llama_token> embd;
        int n_ctx;
        std::string last_result;
    } state;

    llama_context *ctx = nullptr;
    std::mutex lock;
    void init() {
        // Get llama parameters
        auto lparams = llama_context_default_params();
        lparams.seed = params.seed;
        lparams.n_ctx = 2024;
        // Create context
        ctx = llama_init_from_file(params.model.c_str(), lparams);
        if (!ctx) {
            throw Exception("Failed to initialize llama from file");
        }
        // Initialize some variables
        state.n_ctx = llama_n_ctx(ctx);
    }
public:
    struct Exception : public std::runtime_error {
        using std::runtime_error::runtime_error;
    };
    struct ContextLengthException : public Exception {
        ContextLengthException() : Exception("Max. context length exceeded") {}
    };

    LLM(int32_t seed = 0) {
        // Set random seed
        params.seed = seed ? seed : time(NULL);
        // Initialize llama
        init();
    }
    ~LLM() {
        if (ctx) llama_free(ctx);
    }
    void append(std::string prompt) {
        std::scoped_lock L(lock);
        // Check if prompt was empty
        const bool was_empty = state.prompt.empty();
        // Append to current prompt
        state.prompt.append(prompt);
        // Resize buffer for tokens
        const auto old_token_count = state.embd.size();
        state.embd.resize(old_token_count + state.prompt.size() + 1);
        // Run tokenizer
        const auto token_count = llama_tokenize(ctx, prompt.data(), state.embd.data() + old_token_count, state.embd.size() - old_token_count, was_empty);
        state.embd.resize(old_token_count + token_count);
        // Make sure limit is far from being hit
        if (state.embd.size() > state.n_ctx - 6) {
            // Yup. *this MUST be decomposed now.
            throw ContextLengthException();
        }
        // Evaluate new tokens
        // TODO: Larger batch size
        std::cout << "Context size: " << old_token_count << '+' << token_count << '=' << state.embd.size() << '/' << state.n_ctx << std::endl;
        for (int it = old_token_count; it != state.embd.size(); it++) {
            std::cout << llama_token_to_str(ctx, state.embd.data()[it]) << std::flush;
            llama_eval(ctx, state.embd.data() + it, 1, it, params.n_threads);
        }
        std::cout << std::endl;
    }
    std::string run(std::string_view end) {
        std::scoped_lock L(lock);
        std::string fres;
        // Loop until done
        bool abort = false;
        while (!abort && !fres.ends_with(end)) {
            // Sample top p and top k (no last-tokens window; final 1.0f leaves the repeat penalty disabled)
            const auto id = llama_sample_top_p_top_k(ctx, nullptr, 0, params.top_k, params.top_p, params.temp, 1.0f);
            // Add token
            state.embd.push_back(id);
            // Get token as string
            const auto str = llama_token_to_str(ctx, id);
            // Debug
            std::cout << str << std::flush;
            // Append string to function result
            fres.append(str);
            // Evaluate token
            // TODO: Respect batch size
            llama_eval(ctx, state.embd.data() + state.embd.size() - 1, 1, state.embd.size() - 1, params.n_threads);
        }
        // Create final string
        state.prompt.append(fres);
        fres = std::string(fres.data(), fres.size() - end.size());
        // Return final string
        return fres;
    }
};
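
For reference, this is roughly how I drive it (a minimal sketch; the prompt and stop string here are just placeholders, not my real ones):

// Minimal usage sketch (prompt/stop strings are placeholders)
int main() {
    LLM llm; // Seeds the RNG from time(NULL) and loads the model
    // Feed the initial prompt; its tokens are evaluated immediately
    llm.append("User: Hello!\nAssistant:");
    // Generate until the stop string appears; run() strips it from the result
    const auto reply = llm.run("\nUser:");
    std::cout << "\nResult: " << reply << std::endl;
    return 0;
}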
And the results I am getting are good... until they aren't. It starts repeating output very frequently, which it doesn't in the main example with the same settings.
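
For comparison, as far as I can tell the main example hands the sampler a window of recent tokens plus a repeat penalty instead of nullptr/0/1.0f. Inside run(), the sampling call would look more like this (just a sketch; the repeat_last_n and repeat_penalty values are guesses, not taken from my code):

// Sketch: sampling with a repeat penalty over recent tokens
// (repeat_last_n and repeat_penalty are illustrative values)
const size_t repeat_last_n = std::min<size_t>(64, state.embd.size());
const auto id = llama_sample_top_p_top_k(
    ctx,
    state.embd.data() + state.embd.size() - repeat_last_n, // recent tokens
    repeat_last_n,
    params.top_k, params.top_p, params.temp,
    1.3f /* repeat_penalty */);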
Any ideas? What may I be doing wrong?
Thanks
Niansa