| field | value | date |
|---|---|---|
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/examples/simple | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/examples/simple')
| mode | file | lines added |
|---|---|---|
| -rw-r--r-- | llama.cpp/examples/simple/CMakeLists.txt | 5 |
| -rw-r--r-- | llama.cpp/examples/simple/README.md | 21 |
| -rw-r--r-- | llama.cpp/examples/simple/simple.cpp | 220 |
3 files changed, 246 insertions, 0 deletions
diff --git a/llama.cpp/examples/simple/CMakeLists.txt b/llama.cpp/examples/simple/CMakeLists.txt
new file mode 100644
index 0000000..104ecab
--- /dev/null
+++ b/llama.cpp/examples/simple/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-simple)
+add_executable(${TARGET} simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/llama.cpp/examples/simple/README.md b/llama.cpp/examples/simple/README.md
new file mode 100644
index 0000000..937008b
--- /dev/null
+++ b/llama.cpp/examples/simple/README.md
@@ -0,0 +1,21 @@
+# llama.cpp/example/simple
+
+The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
+
+```bash
+./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
+
+...
+
+main: n_len = 32, n_ctx = 2048, n_parallel = 1, n_kv_req = 32
+
+ Hello my name is Shawn and I'm a 20 year old male from the United States. I'm a 20 year old
+
+main: decoded 27 tokens in 2.31 s, speed: 11.68 t/s
+
+llama_print_timings: load time = 579.15 ms
+llama_print_timings: sample time = 0.72 ms / 28 runs ( 0.03 ms per token, 38888.89 tokens per second)
+llama_print_timings: prompt eval time = 655.63 ms / 10 tokens ( 65.56 ms per token, 15.25 tokens per second)
+llama_print_timings: eval time = 2180.97 ms / 27 runs ( 80.78 ms per token, 12.38 tokens per second)
+llama_print_timings: total time = 2891.13 ms
+```
diff --git a/llama.cpp/examples/simple/simple.cpp b/llama.cpp/examples/simple/simple.cpp
new file mode 100644
index 0000000..d09771d
--- /dev/null
+++ b/llama.cpp/examples/simple/simple.cpp
@@ -0,0 +1,220 @@
+#include "llama.h"
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    // path to the model gguf file
+    std::string model_path;
+    // prompt to generate text from
+    std::string prompt = "Hello my name is";
+    // number of layers to offload to the GPU
+    int ngl = 99;
+    // number of tokens to predict
+    int n_predict = 32;
+
+    // parse command line arguments
+
+    {
+        int i = 1;
+        for (; i < argc; i++) {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-n") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        n_predict = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        ngl = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                // prompt starts here
+                break;
+            }
+        }
+        if (model_path.empty()) {
+            print_usage(argc, argv);
+            return 1;
+        }
+        if (i < argc) {
+            prompt = argv[i++];
+            for (; i < argc; i++) {
+                prompt += " ";
+                prompt += argv[i];
+            }
+        }
+    }
+
+    // load dynamic backends
+
+    ggml_backend_load_all();
+
+    // initialize the model
+
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
+
+    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
+
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    // tokenize the prompt
+
+    // find the number of tokens in the prompt
+    const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+
+    // allocate space for the tokens and tokenize the prompt
+    std::vector<llama_token> prompt_tokens(n_prompt);
+    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
+        return 1;
+    }
+
+    // initialize the context
+
+    llama_context_params ctx_params = llama_context_default_params();
+    // n_ctx is the context size
+    ctx_params.n_ctx = n_prompt + n_predict - 1;
+    // n_batch is the maximum number of tokens that can be processed in a single call to llama_decode
+    ctx_params.n_batch = n_prompt;
+    // enable performance counters
+    ctx_params.no_perf = false;
+
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
+
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    // initialize the sampler
+
+    auto sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = false;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+    // print the prompt token-by-token
+
+    for (auto id : prompt_tokens) {
+        char buf[128];
+        int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
+        if (n < 0) {
+            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+            return 1;
+        }
+        std::string s(buf, n);
+        printf("%s", s.c_str());
+    }
+
+    // prepare a batch for the prompt
+
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+
+    if (llama_model_has_encoder(model)) {
+        if (llama_encode(ctx, batch)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+        if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
+            decoder_start_token_id = llama_vocab_bos(vocab);
+        }
+
+        batch = llama_batch_get_one(&decoder_start_token_id, 1);
+    }
+
+    // main loop
+
+    const auto t_main_start = ggml_time_us();
+    int n_decode = 0;
+    llama_token new_token_id;
+
+    for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
+        // evaluate the current batch with the transformer model
+        if (llama_decode(ctx, batch)) {
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            return 1;
+        }
+
+        n_pos += batch.n_tokens;
+
+        // sample the next token
+        {
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);
+
+            // is it an end of generation?
+            if (llama_vocab_is_eog(vocab, new_token_id)) {
+                break;
+            }
+
+            char buf[128];
+            int n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+                return 1;
+            }
+            std::string s(buf, n);
+            printf("%s", s.c_str());
+            fflush(stdout);
+
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);
+
+            n_decode += 1;
+        }
+    }
+
+    printf("\n");
+
+    const auto t_main_end = ggml_time_us();
+
+    fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
+
+    fprintf(stderr, "\n");
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
+    fprintf(stderr, "\n");
+
+    llama_sampler_free(smpl);
+    llama_free(ctx);
+    llama_model_free(model);
+
+    return 0;
+}
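For reference, a minimal build-and-run sketch for the new example. Only the `llama-simple` target name and the command-line flags come from the files above; the build directory, binary location, and model path are assumptions, not part of this commit.

```bash
# configure and build just the llama-simple target added by this commit
# (build directory and output path below are assumptions)
cmake -B build
cmake --build build --target llama-simple

# run with a model file, an optional token budget (-n), and a prompt,
# mirroring the usage shown in the README above
./build/bin/llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -n 32 "Hello my name is"
```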
