Hey, I have written this C++ class:
#include <cstdint>
#include <string>
#include <string_view>
#include <vector>
#include <mutex>
#include <thread>
#include <iostream>
#include <stdexcept>
#include <ctime>

#include "llama.h"

class LLM {
    struct {
        std::string model = "7B-ggml-model-quant.bin";
        int32_t seed; // RNG seed
        int32_t n_threads = static_cast<int32_t>(std::thread::hardware_concurrency()) / 4;
        int32_t n_ctx = 2024; // Context size
        int32_t n_batch = 8; // Batch size, unused for now
        int32_t top_k = 40;
        float top_p = 0.5f;
        float temp = 0.72f;
    } params;

    struct State {
        std::string prompt;
        std::vector<llama_token> embd;
        int n_ctx;
        std::string last_result;
    } state;

    llama_context *ctx = nullptr;
    std::mutex lock;
    void init() {
        // Get llama parameters
        auto lparams = llama_context_default_params();
        lparams.seed = params.seed;
        lparams.n_ctx = 2024;
        // Create context
        ctx = llama_init_from_file(params.model.c_str(), lparams);
        if (!ctx) {
            throw Exception("Failed to initialize llama from file");
        }
        // Initialize some variables
        state.n_ctx = llama_n_ctx(ctx);
    }
public:
    struct Exception : public std::runtime_error {
        using std::runtime_error::runtime_error;
    };
    struct ContextLengthException : public Exception {
        ContextLengthException() : Exception("Max. context length exceeded") {}
    };

    LLM(int32_t seed = 0) {
        // Set random seed
        params.seed = seed ? seed : time(NULL);
        // Initialize llama
        init();
    }
    ~LLM() {
        if (ctx) llama_free(ctx);
    }
    void append(std::string prompt) {
        std::scoped_lock L(lock);
        // Check if prompt was empty
        const bool was_empty = state.prompt.empty();
        // Append to current prompt
        state.prompt.append(prompt);
        // Resize buffer for tokens
        const auto old_token_count = state.embd.size();
        state.embd.resize(old_token_count + state.prompt.size() + 1);
        // Run tokenizer
        const auto token_count = llama_tokenize(ctx, prompt.data(), state.embd.data() + old_token_count, state.embd.size() - old_token_count, was_empty);
        state.embd.resize(old_token_count + token_count);
        // Make sure limit is far from being hit
        if (state.embd.size() > state.n_ctx - 6) {
            // Yup. *this MUST be decomposed now.
            throw ContextLengthException();
        }
        // Evaluate new tokens
        // TODO: Larger batch size
        std::cout << "Context size: " << old_token_count << '+' << token_count << '=' << state.embd.size() << '/' << state.n_ctx << std::endl;
        for (int it = old_token_count; it != state.embd.size(); it++) {
            std::cout << llama_token_to_str(ctx, state.embd.data()[it]) << std::flush;
            llama_eval(ctx, state.embd.data() + it, 1, it, params.n_threads);
        }
        std::cout << std::endl;
    }
    std::string run(std::string_view end) {
        std::scoped_lock L(lock);
        std::string fres;
        // Loop until done
        bool abort = false;
        while (!abort && !fres.ends_with(end)) {
            // Sample top p and top k (no last-tokens window; final 1.0f leaves the repeat penalty disabled)
            const auto id = llama_sample_top_p_top_k(ctx, nullptr, 0, params.top_k, params.top_p, params.temp, 1.0f);
            // Add token
            state.embd.push_back(id);
            // Get token as string
            const auto str = llama_token_to_str(ctx, id);
            // Debug
            std::cout << str << std::flush;
            // Append string to function result
            fres.append(str);
            // Evaluate token
            // TODO: Respect batch size
            llama_eval(ctx, state.embd.data() + state.embd.size() - 1, 1, state.embd.size() - 1, params.n_threads);
        }
        // Create final string
        state.prompt.append(fres);
        fres = std::string(fres.data(), fres.size() - end.size());
        // Return final string
        return fres;
    }
};
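
For reference, this is roughly how I drive it (a minimal sketch; the prompt and stop string here are just placeholders, not my real ones):

// Minimal usage sketch (prompt/stop strings are placeholders)
int main() {
    LLM llm; // Seeds the RNG from time(NULL) and loads the model
    // Feed the initial prompt; its tokens are evaluated immediately
    llm.append("User: Hello!\nAssistant:");
    // Generate until the stop string appears; run() strips it from the result
    const auto reply = llm.run("\nUser:");
    std::cout << "\nResult: " << reply << std::endl;
    return 0;
}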
And the results I am getting are good... until they aren't. It starts repeating output very frequently, which it doesn't in the main example with the same settings.
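
For comparison, as far as I can tell the main example hands the sampler a window of recent tokens plus a repeat penalty instead of nullptr/0/1.0f. Inside run(), the sampling call would look more like this (just a sketch; the repeat_last_n and repeat_penalty values are guesses, not taken from my code):

// Sketch: sampling with a repeat penalty over recent tokens
// (repeat_last_n and repeat_penalty are illustrative values)
const size_t repeat_last_n = std::min<size_t>(64, state.embd.size());
const auto id = llama_sample_top_p_top_k(
    ctx,
    state.embd.data() + state.embd.size() - repeat_last_n, // recent tokens
    repeat_last_n,
    params.top_k, params.top_p, params.temp,
    1.3f /* repeat_penalty */);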
Any ideas? What may I be doing wrong?
Thanks
Niansa