Diffstat (limited to 'llama.cpp/examples/eval-callback')
| -rw-r--r-- | llama.cpp/examples/eval-callback/CMakeLists.txt    | 26 |
| -rw-r--r-- | llama.cpp/examples/eval-callback/README.md         | 95 |
| -rw-r--r-- | llama.cpp/examples/eval-callback/eval-callback.cpp | 80 |
3 files changed, 201 insertions, 0 deletions
diff --git a/llama.cpp/examples/eval-callback/CMakeLists.txt b/llama.cpp/examples/eval-callback/CMakeLists.txt
new file mode 100644
index 0000000..6439690
--- /dev/null
+++ b/llama.cpp/examples/eval-callback/CMakeLists.txt
@@ -0,0 +1,26 @@
+set(TARGET llama-eval-callback)
+add_executable(${TARGET} eval-callback.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_BUILD_TESTS)
+    if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
+        set(MODEL_NAME "tinyllamas/stories15M-q4_0.gguf")
+        set(MODEL_HASH "SHA256=66967fbece6dbe97886593fdbb73589584927e29119ec31f08090732d1861739")
+    else()
+        set(MODEL_NAME "tinyllamas/stories15M-be.Q4_0.gguf")
+        set(MODEL_HASH "SHA256=9aec857937849d976f30397e97eb1cabb53eb9dcb1ce4611ba8247fb5f44c65d")
+    endif()
+    set(MODEL_DEST "${CMAKE_BINARY_DIR}/${MODEL_NAME}")
+    set(TEST_TARGET test-eval-callback)
+    add_test(NAME ${TEST_TARGET}-download-model COMMAND ${CMAKE_COMMAND}
+        -DDEST=${MODEL_DEST}
+        -DNAME=${MODEL_NAME}
+        -DHASH=${MODEL_HASH}
+        -P ${CMAKE_SOURCE_DIR}/cmake/download-models.cmake
+    )
+    set_tests_properties(${TEST_TARGET}-download-model PROPERTIES FIXTURES_SETUP ${TEST_TARGET}-download-model)
+    add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback -m "${MODEL_DEST}" --prompt hello --seed 42 -ngl 0)
+    set_tests_properties(${TEST_TARGET} PROPERTIES FIXTURES_REQUIRED ${TEST_TARGET}-download-model)
+endif()
diff --git a/llama.cpp/examples/eval-callback/README.md b/llama.cpp/examples/eval-callback/README.md
new file mode 100644
index 0000000..63a57ad
--- /dev/null
+++ b/llama.cpp/examples/eval-callback/README.md
@@ -0,0 +1,95 @@
+# llama.cpp/examples/eval-callback
+
+A simple example which demonstrates how to use a callback during inference.
+It simply prints all operations and tensor data to the console.
+
+Usage:
+
+```shell
+llama-eval-callback \
+    --hf-repo ggml-org/models \
+    --hf-file phi-2/ggml-model-q4_0.gguf \
+    --model phi-2-q4_0.gguf \
+    --prompt hello \
+    --seed 42 \
+    -ngl 33
+```
+
+Will print:
+
+```shell
+llm_load_tensors: offloaded 33/33 layers to GPU
+...
+llama_new_context_with_model: n_ctx = 512
+...
+llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
+llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
+llama_new_context_with_model: graph nodes = 1225
+llama_new_context_with_model: graph splits = 2
+ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
+ [
+  [
+   [ -0.0181, 0.0272, 0.0272, ...],
+  ],
+ ]
+ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+ [
+  [
+   [ -0.6989, 1.0636, 1.0636, ...],
+  ],
+ ]
+ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+ [
+  [
+   [ -0.1800, 0.2817, 0.2632, ...],
+  ],
+ ]
+ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+ [
+  [
+   [ -0.1863, 0.2970, 0.2604, ...],
+  ],
+ ]
+ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
+ [
+  [
+   [ -1.1238, 1.2876, -1.8086, ...],
+  ],
+ ]
+ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
+ [
+  [
+   [ -1.1135, 1.4604, -1.9226, ...],
+  ],
+ ]
+ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
+ [
+  [
+   [ -1.1135, 1.4604, -1.9226, ...],
+  ],
+ ]
+ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+ [
+  [
+   [ -1.1135, 1.4604, -1.9226, ...],
+  ],
+ ]
+ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
+ [
+  [
+   [ -1.1135, 1.4604, -1.9226, ...],
+   [ -0.3608, 0.5076, -1.8866, ...],
+   [ 1.7643, 0.0273, -2.1065, ...],
+   ...
+  ],
+ ]
+ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
+ [
+  [
+   [ -1.1135, 1.4604, -1.9226, ...],
+   [ -0.3608, 0.5076, -1.8866, ...],
+   [ 1.7643, 0.0273, -2.1065, ...],
+   ...
+  ],
+ ]
+```
diff --git a/llama.cpp/examples/eval-callback/eval-callback.cpp b/llama.cpp/examples/eval-callback/eval-callback.cpp
new file mode 100644
index 0000000..bd58734
--- /dev/null
+++ b/llama.cpp/examples/eval-callback/eval-callback.cpp
@@ -0,0 +1,80 @@
+#include "arg.h"
+#include "common.h"
+#include "debug.h"
+#include "log.h"
+#include "llama.h"
+#include "llama-cpp.h"
+#include <string>
+#include <vector>
+
+static bool run(llama_context * ctx, const common_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
+
+    if (tokens.empty()) {
+        LOG_ERR("%s : there are no input tokens to process - (try to provide a prompt with '-p')\n", __func__);
+        return false;
+    }
+
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
+        LOG_ERR("%s : failed to eval\n", __func__);
+        return false;
+    }
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    base_callback_data cb_data;
+
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+        return 1;
+    }
+
+    common_init();
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    params.cb_eval = common_debug_cb_eval<false>;
+    params.cb_eval_user_data = &cb_data;
+    params.warmup = false;
+
+    // init
+    auto llama_init = common_init_from_params(params);
+
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();
+
+    if (model == nullptr || ctx == nullptr) {
+        LOG_ERR("%s : failed to init\n", __func__);
+        return 1;
+    }
+
+    // print system information
+    {
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+    }
+
+    bool OK = run(ctx, params);
+    if (!OK) {
+        return 1;
+    }
+
+    LOG("\n");
+    llama_perf_context_print(ctx);
+
+    llama_backend_free();
+
+    return 0;
+}
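The example above registers the stock debug printer (`common_debug_cb_eval<false>` from `debug.h`) as the scheduler eval callback via `params.cb_eval`. As a rough illustration of what such a callback looks like, here is a minimal sketch of a custom `ggml_backend_sched_eval_callback` that inspects only matrix multiplications; the `my_cb_data` struct, the `my_eval_cb` name, and the `MUL_MAT` filter are illustrative assumptions and are not part of this diff.

```cpp
// Minimal sketch of a custom eval callback, assuming ggml's two-phase protocol:
// the scheduler first calls the callback with ask=true to ask whether the node's
// data should be made available, then again with ask=false once the tensor has
// been computed.
#include <cstdint>
#include <cstdio>
#include <vector>

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical user-data struct (this commit uses base_callback_data instead).
struct my_cb_data {
    std::vector<uint8_t> host_buf; // scratch buffer for tensors stored on a GPU backend
};

static bool my_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * data = (my_cb_data *) user_data;

    if (ask) {
        // Only request data for matrix multiplications, as an example filter.
        return t->op == GGML_OP_MUL_MAT;
    }

    // ask == false: the node has been computed; copy it to host memory if needed.
    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
    if (!is_host) {
        data->host_buf.resize(ggml_nbytes(t));
        ggml_backend_tensor_get(t, data->host_buf.data(), 0, ggml_nbytes(t));
    }

    // Print the op, tensor name, shape and the first element (f32 tensors only).
    if (t->type == GGML_TYPE_F32) {
        const float * x = is_host ? (const float *) t->data
                                  : (const float *) data->host_buf.data();
        printf("%s: %s [%lld, %lld, %lld, %lld] first=%f\n",
               ggml_op_desc(t), t->name,
               (long long) t->ne[0], (long long) t->ne[1],
               (long long) t->ne[2], (long long) t->ne[3],
               x[0]);
    }

    // Returning true lets the graph computation continue.
    return true;
}
```

Wired up the same way as in `eval-callback.cpp` (`params.cb_eval = my_eval_cb;` with `params.cb_eval_user_data` pointing at a `my_cb_data` instance), this sketch would print one line per `MUL_MAT` node instead of dumping every operation and tensor.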
