Simple Vector Database

Author Mitja Felicijan <mitja.felicijan@gmail.com> 2026-02-13 03:29:25 +0100
Committer Mitja Felicijan <mitja.felicijan@gmail.com> 2026-02-13 03:29:25 +0100
Commit a1a595a3305727d30e16e856f4faf95980643e1c
-rw-r--r-- Dockerfile    3
-rw-r--r-- Makefile      6
-rw-r--r-- context.txt  33
-rw-r--r-- prompt.c    408
-rw-r--r-- vectordb.c   92
-rw-r--r-- vectordb.h   29
6 files changed, 515 insertions(+), 56 deletions(-)
diff --git a/Dockerfile b/Dockerfile
...
 RUN apt-get install -y libstdc++6
 
 COPY prompt /app/prompt
+COPY context.txt /app/context.txt
 COPY models/ /app/models/
 
-# ENTRYPOINT ["bash"]
+ENTRYPOINT ["bash"]
diff --git a/Makefile b/Makefile
...
 
 help: .help
 
-prompt: prompt.c models.h # Build prompt binary for testing
-	$(CC) $(CFLAGS) prompt.c -o prompt $(LDFLAGS)
+prompt: prompt.c vectordb.c models.h # Build prompt binary for testing
+	$(CC) $(CFLAGS) prompt.c vectordb.c -o prompt $(LDFLAGS)
 
 llamacpp: .assure # Build llama.cpp libraries
 	mkdir $(LLAMA_DIR)/build && \
...
 
 docker: .assure # Runs prompt in Docker container
 	docker build -t promptd .
-	docker run -it promptd bash
+	docker run -it promptd
 
 clean: # Cleans up all the build artefacts
 	-rm -f prompt
...
diff --git a/context.txt b/context.txt
-Gandalf: wizard, Lord of the Rings, grey beard, staff, Istari, Grey Pilgrim, Mithrandir, fought Sauron, helped destroy One Ring.
-
-Frodo: hobbit, Lord of the Rings, Bilbo's nephew, Shire, carried One Ring to Mount Doom, Fellowship of the Ring.
-
-Example: Who is Gandalf? Gandalf is a wizard from The Lord of the Rings.
-Example: Who is Frodo? Frodo is a hobbit from The Lord of the Rings.
-Example: Who is Harry Potter? I don't have that information.
-
-Answer this question. Use only the facts from above. If unknown, say "I don't have that information." Just give the answer, no prefix:
+Gandalf is a wizard in The Lord of the Rings with a grey beard and a staff.
+Gandalf is one of the Istari and is called the Grey Pilgrim and Mithrandir.
+Gandalf fought Sauron and helped destroy the One Ring.
+Frodo Baggins is a hobbit in The Lord of the Rings and is Bilbo's nephew.
+Frodo is from the Shire and carried the One Ring to Mount Doom.
+Frodo is a member of the Fellowship of the Ring.
+Samwise Gamgee is a hobbit from the Shire in The Lord of the Rings.
+Samwise is Frodo's loyal companion and a member of the Fellowship of the Ring.
+Aragorn is a man in The Lord of the Rings and is known as Strider.
+Aragorn is a ranger, a leader of Men, and a member of the Fellowship of the Ring.
+Legolas is an elf in The Lord of the Rings and a skilled archer.
+Legolas is a member of the Fellowship of the Ring.
+Gimli is a dwarf in The Lord of the Rings and a warrior.
+Gimli is a member of the Fellowship of the Ring.
+Boromir is a man from Gondor in The Lord of the Rings.
+Boromir is a member of the Fellowship of the Ring.
+The One Ring is a powerful ring in The Lord of the Rings that was created by Sauron.
+The One Ring corrupts its bearer and must be destroyed in Mount Doom.
+Sauron is the Dark Lord in The Lord of the Rings and created the One Ring.
+Sauron is an enemy of the free peoples of Middle-earth.
+Mordor is the realm of Sauron in The Lord of the Rings and contains Mount Doom.
+Mount Doom is a volcano in Mordor in The Lord of the Rings where the One Ring was destroyed.
+The Shire is the homeland of hobbits in The Lord of the Rings and the home of Frodo and Samwise.
+Gondor is a kingdom of Men in The Lord of the Rings and the home of Boromir.
diff --git a/prompt.c b/prompt.c
 #include "llama.h"
+#include "vectordb.h"
 #include "models.h"
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <getopt.h>
+#include <ctype.h>
 
-static void show_help(const char *prog) {
-    printf("Usage: %s [OPTIONS]\n", prog);
-    printf("Options:\n");
-    printf("  -m, --model <name>    Specify model to use (default: first model)\n");
-    printf("  -p, --prompt <text>   Specify prompt text (default: \"What is 2+2?\")\n");
-    printf("  -h, --help            Show this help message\n");
+#define MAX_TOKENS 512
+#define MAX_TOKEN_LEN 32
+
+static const char *refusal_text = "I don't have that information.";
+
+static void llama_log_callback(enum ggml_log_level level, const char *text, void *user_data) {
+    (void)level;
+    (void)user_data;
+    (void)text;
 }
 
-int main(int argc, char **argv) {
-    const char *model_name = NULL;
-    const char *prompt = NULL;
-
-    int n_predict = 64;
-
-    static struct option long_options[] = {
-        {"model", required_argument, 0, 'm'},
-        {"prompt", required_argument, 0, 'p'},
-        {"help", no_argument, 0, 'h'},
-        {0, 0, 0, 0}
+static int is_stopword(const char *token, size_t len) {
+    static const char *stopwords[] = {
+        "a", "an", "the", "is", "are", "was", "were", "of", "to", "in", "on",
+        "for", "with", "and", "or", "not", "if", "then", "else", "from", "by",
+        "as", "at", "it", "its", "this", "that", "these", "those", "who", "what",
+        "when", "where", "why", "how", "which", "about", "into", "over", "under",
+        "be", "been", "being", "do", "does", "did", "but", "so", "than"
     };
+    for (size_t i = 0; i < sizeof(stopwords) / sizeof(stopwords[0]); i++) {
+        if (strlen(stopwords[i]) == len && strncmp(stopwords[i], token, len) == 0) {
+            return 1;
+        }
+    }
+    return 0;
+}
 
-    int opt;
-    int option_index = 0;
-    while ((opt = getopt_long(argc, argv, "m:p:h", long_options, &option_index)) != -1) {
-        switch (opt) {
-            case 'm':
-                model_name = optarg;
-                break;
-            case 'p':
-                prompt = optarg;
+static int token_exists(char tokens[MAX_TOKENS][MAX_TOKEN_LEN], int count, const char *token) {
+    for (int i = 0; i < count; i++) {
+        if (strcmp(tokens[i], token) == 0) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+static int collect_tokens(const char *text, char tokens[MAX_TOKENS][MAX_TOKEN_LEN]) {
+    int count = 0;
+    char buf[MAX_TOKEN_LEN];
+    int len = 0;
+    for (const unsigned char *p = (const unsigned char *)text; ; p++) {
+        if (isalnum(*p)) {
+            if (len < MAX_TOKEN_LEN - 1) {
+                buf[len++] = (char)tolower(*p);
+            }
+        } else {
+            if (len > 0) {
+                buf[len] = '\0';
+                if (len >= 4 && !is_stopword(buf, (size_t)len)) {
+                    if (!token_exists(tokens, count, buf) && count < MAX_TOKENS) {
+                        strncpy(tokens[count], buf, MAX_TOKEN_LEN - 1);
+                        tokens[count][MAX_TOKEN_LEN - 1] = '\0';
+                        count++;
+                    }
+                }
+                len = 0;
+            }
+            if (*p == '\0') {
                 break;
-            case 'h':
-                show_help(argv[0]);
-                return 0;
-            default:
-                fprintf(stderr, "Usage: %s [-m model] [-p prompt] [-h]\n", argv[0]);
-                return 1;
+            }
         }
     }
+    return count;
+}
 
-    if (prompt == NULL) {
-        printf("Prompt must be provided. Exiting...");
-        return 1;
+static int has_overlap(const char *a, const char *b) {
+    if (a == NULL || b == NULL) {
+        return 0;
     }
+    char tokens[MAX_TOKENS][MAX_TOKEN_LEN];
+    int token_count = collect_tokens(b, tokens);
+    if (token_count == 0) {
+        return 0;
+    }
+    char buf[MAX_TOKEN_LEN];
+    int len = 0;
+    for (const unsigned char *p = (const unsigned char *)a; ; p++) {
+        if (isalnum(*p)) {
+            if (len < MAX_TOKEN_LEN - 1) {
+                buf[len++] = (char)tolower(*p);
+            }
+        } else {
+            if (len > 0) {
+                buf[len] = '\0';
+                if (len >= 4 && !is_stopword(buf, (size_t)len)) {
+                    if (token_exists(tokens, token_count, buf)) {
+                        return 1;
+                    }
+                }
+                len = 0;
+            }
+            if (*p == '\0') {
+                break;
+            }
+        }
+    }
+    return 0;
+}
 
+static int execute_prompt(const char *model_name, const char *prompt, const char *context, int n_predict) {
     const model_config *cfg = NULL;
     if (model_name != NULL) {
         cfg = get_model_by_name(model_name);
...
         cfg = &models[0];
     }
 
+    if (!has_overlap(prompt, context)) {
+        printf("------------ Prompt: %s\n", prompt);
+        printf("------------ Response: %s\n", refusal_text);
+        return 0;
+    }
+
     ggml_backend_load_all();
 
     struct llama_model_params model_params = llama_model_default_params();
...
 
     const struct llama_vocab *vocab = llama_model_get_vocab(model);
 
-    int n_prompt = -llama_tokenize(vocab, prompt, strlen(prompt), NULL, 0, true, true);
+    const char *system_prefix = "System: Answer using only the Context. If the answer is not explicitly stated in Context, respond exactly: I don't have that information.\n\n";
+    const char *context_prefix = "Context:\n";
+    const char *prompt_prefix = "\n\nQuestion:\n";
+    const char *answer_prefix = "\n\nAnswer:\n";
+    size_t context_len = context ? strlen(context) : 0;
+    size_t prompt_len = strlen(prompt);
+    size_t full_len = strlen(system_prefix) + strlen(context_prefix) + context_len + strlen(prompt_prefix) + prompt_len + strlen(answer_prefix) + 1;
+    char *full_prompt = (char *)malloc(full_len);
+    if (full_prompt == NULL) {
+        fprintf(stderr, "Error: failed to allocate prompt buffer\n");
+        llama_model_free(model);
+        return 1;
+    }
+    snprintf(full_prompt, full_len, "%s%s%s%s%s", system_prefix, context_prefix, context ? context : "", prompt_prefix, prompt);
+    strncat(full_prompt, answer_prefix, full_len - strlen(full_prompt) - 1);
+
+    int n_prompt = -llama_tokenize(vocab, full_prompt, strlen(full_prompt), NULL, 0, true, true);
     llama_token *prompt_tokens = (llama_token *)malloc(n_prompt * sizeof(llama_token));
-    if (llama_tokenize(vocab, prompt, strlen(prompt), prompt_tokens, n_prompt, true, true) < 0) {
+    if (llama_tokenize(vocab, full_prompt, strlen(full_prompt), prompt_tokens, n_prompt, true, true) < 0) {
         fprintf(stderr, "Error: failed to tokenize the prompt\n");
+        free(full_prompt);
         free(prompt_tokens);
         llama_model_free(model);
         return 1;
...
     struct llama_context *ctx = llama_init_from_model(model, ctx_params);
     if (ctx == NULL) {
         fprintf(stderr, "Error: failed to create the llama_context\n");
+        free(full_prompt);
         free(prompt_tokens);
         llama_model_free(model);
         return 1;
...
     llama_sampler_chain_add(smpl, llama_sampler_init_dist(cfg->seed));
 
     struct llama_batch batch = llama_batch_get_one(prompt_tokens, n_prompt);
 
     if (llama_model_has_encoder(model)) {
         if (llama_encode(ctx, batch)) {
             fprintf(stderr, "Error: failed to encode prompt\n");
+            llama_sampler_free(smpl);
+            free(full_prompt);
+            free(prompt_tokens);
+            llama_free(ctx);
+            llama_model_free(model);
             return 1;
         }
 
...
         batch = llama_batch_get_one(&decoder_start, 1);
     }
 
-    printf("Prompt: %s\n", prompt);
-    printf("Response: ");
+    printf("------------ Prompt: %s\n", prompt);
+    printf("------------ Response: ");
     fflush(stdout);
 
     int n_pos = 0;
     llama_token new_token_id;
+    size_t out_cap = 256;
+    size_t out_len = 0;
+    char *out = (char *)malloc(out_cap);
+    if (out == NULL) {
+        fprintf(stderr, "Error: failed to allocate output buffer\n");
+        free(full_prompt);
+        free(prompt_tokens);
+        llama_sampler_free(smpl);
+        llama_free(ctx);
+        llama_model_free(model);
+        return 1;
+    }
+    out[0] = '\0';
 
     while (n_pos + batch.n_tokens < n_prompt + n_predict) {
         if (llama_decode(ctx, batch)) {
...
             fprintf(stderr, "Error: failed to convert token to piece\n");
             break;
         }
-        printf("%.*s", n, buf);
-        fflush(stdout);
+        int stop_at = n;
+        for (int i = 0; i < n; i++) {
+            if (buf[i] == '\n') {
+                stop_at = i;
+                break;
+            }
+        }
+        if (out_len + (size_t)stop_at + 1 > out_cap) {
+            while (out_len + (size_t)stop_at + 1 > out_cap) {
+                out_cap *= 2;
+            }
+            char *next = (char *)realloc(out, out_cap);
+            if (next == NULL) {
+                fprintf(stderr, "Error: failed to grow output buffer\n");
+                break;
+            }
+            out = next;
+        }
+        memcpy(out + out_len, buf, (size_t)stop_at);
+        out_len += (size_t)stop_at;
+        out[out_len] = '\0';
+
+        if (stop_at != n) {
+            break;
+        }
 
         batch = llama_batch_get_one(&new_token_id, 1);
     }
 
-    printf("\n");
+    if (!has_overlap(out, context)) {
+        strcpy(out, refusal_text);
+        out_len = strlen(out);
+    }
+
+    printf("%s\n", out);
 
+    free(full_prompt);
     free(prompt_tokens);
+    free(out);
     llama_sampler_free(smpl);
     llama_free(ctx);
     llama_model_free(model);
 
     return 0;
 }
+
+static char *generate_context(const char *model_name, const char *context_file, const char *prompt) {
+    FILE *context_fp = fopen(context_file, "r");
+    if (context_fp == NULL) {
+        fprintf(stderr, "Error: unable to open context file %s\n", context_file);
+        return NULL;
+    }
+
+    llama_backend_init();
+
+    const model_config *cfg = NULL;
+    if (model_name != NULL) {
+        cfg = get_model_by_name(model_name);
+        if (cfg == NULL) {
+            fprintf(stderr, "Error: unknown model '%s'\n", model_name);
+            fclose(context_fp);
+            llama_backend_free();
+            return NULL;
+        }
+    } else {
+        cfg = &models[0];
+    }
+
+    /* struct llama_model *model = llama_load_model_from_file(cfg->filepath, llama_model_default_params()); */
+    struct llama_model *model = llama_model_load_from_file(cfg->filepath, llama_model_default_params());
+    if (model == NULL) {
+        fprintf(stderr, "Error: unable to load embedding model\n");
+        fclose(context_fp);
+        llama_backend_free();
+        return NULL;
+    }
+
+    struct llama_context_params cparams = llama_context_default_params();
+    cparams.embeddings = true;
+
+    /* struct llama_context *embed_ctx = llama_new_context_with_model(model, cparams); */
+    struct llama_context *embed_ctx = llama_init_from_model(model, cparams);
+    if (embed_ctx == NULL) {
+        fprintf(stderr, "Error: failed to create embedding context\n");
+        llama_model_free(model);
+        fclose(context_fp);
+        llama_backend_free();
+        return NULL;
+    }
+
+    VectorDB db;
+    vdb_init(&db, embed_ctx);
+
+    char line[1024];
+    while (fgets(line, sizeof(line), context_fp) != NULL) {
+        size_t len = strlen(line);
+        while (len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r')) {
+            line[len - 1] = '\0';
+            len--;
+        }
+        if (len == 0) {
+            continue;
+        }
+        vdb_add_document(&db, line);
+    }
+
+    float query[VDB_EMBED_SIZE];
+    int results[3];
+
+    vdb_embed_query(&db, prompt, query);
+    vdb_search(&db, query, 3, results);
+
+    size_t context_cap = 1024;
+    size_t context_len = 0;
+    char *context = (char *)malloc(context_cap);
+    if (context == NULL) {
+        fprintf(stderr, "Error: failed to allocate context buffer\n");
+        fclose(context_fp);
+        llama_free(embed_ctx);
+        llama_model_free(model);
+        llama_backend_free();
+        return NULL;
+    }
+    context[0] = '\0';
+
+    for (int i = 0; i < 3; i++) {
+        if (results[i] < 0) {
+            continue;
+        }
+        const char *text = db.docs[results[i]].text;
+        size_t text_len = strlen(text);
+        size_t need = context_len + text_len + 2;
+        if (need > context_cap) {
+            while (need > context_cap) {
+                context_cap *= 2;
+            }
+            char *next = (char *)realloc(context, context_cap);
+            if (next == NULL) {
+                fprintf(stderr, "Error: failed to grow context buffer\n");
+                free(context);
+                fclose(context_fp);
+                llama_free(embed_ctx);
+                llama_model_free(model);
+                llama_backend_free();
+                return NULL;
+            }
+            context = next;
+        }
+        memcpy(context + context_len, text, text_len);
+        context_len += text_len;
+        context[context_len++] = '\n';
+        context[context_len] = '\0';
+    }
+
+    fclose(context_fp);
+    llama_free(embed_ctx);
+    llama_model_free(model);
+    llama_backend_free();
+
+    return context;
+}
+
+static void show_help(const char *prog) {
+    printf("Usage: %s [OPTIONS]\n", prog);
+    printf("Options:\n");
+    printf("  -m, --model <name>    Specify model to use (default: first model)\n");
+    printf("  -p, --prompt <text>   Specify prompt text (default: \"What is 2+2?\")\n");
+    printf("  -c, --context <text>  Specify context file\n");
+    printf("  -v, --verbose         Enable verbose logging\n");
+    printf("  -h, --help            Show this help message\n");
+}
+
+int main(int argc, char **argv) {
+    const char *model_name = NULL;
+    const char *prompt = NULL;
+    const char *context_file = NULL;
+    int verbose = 0;
+
+    int n_predict = 64;
+
+    static struct option long_options[] = {
+        {"model", required_argument, 0, 'm'},
+        {"prompt", required_argument, 0, 'p'},
+        {"context", required_argument, 0, 'c'},
+        {"verbose", no_argument, 0, 'v'},
+        {"help", no_argument, 0, 'h'},
+        {0, 0, 0, 0}
+    };
+
+    int opt;
+    int option_index = 0;
+    while ((opt = getopt_long(argc, argv, "m:p:c:vh", long_options, &option_index)) != -1) {
+        switch (opt) {
+            case 'm':
+                model_name = optarg;
+                break;
+            case 'p':
+                prompt = optarg;
+                break;
+            case 'c':
+                context_file = optarg;
+                break;
+            case 'v':
+                verbose = 1;
+                break;
+            case 'h':
+                show_help(argv[0]);
+                return 0;
+            default:
+                fprintf(stderr, "Usage: %s [-m model] [-p prompt] [-h]\n", argv[0]);
+                return 1;
+        }
+    }
+
+    if (verbose == 0) {
+        llama_log_set(llama_log_callback, NULL);
+    }
+
+    if (prompt == NULL) {
+        printf("Prompt must be provided. Exiting...");
+        return 1;
+    }
+
+    if (context_file == NULL) {
+        printf("Context file must be provided. Exiting...");
+        return 1;
+    }
+
+    char *context = generate_context(model_name, context_file, prompt);
+    if (context == NULL) {
+        return 1;
+    }
+
+    int rc = execute_prompt(model_name, prompt, context, n_predict);
+    free(context);
+    return rc;
+}
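
Aside: the has_overlap() guard above is plain keyword matching, not an embedding comparison. collect_tokens() lowercases each alphanumeric run, keeps only words of at least four characters that are not stopwords, and has_overlap() reports whether the question (and, after generation, the answer) shares at least one such keyword with the retrieved context. The standalone sketch below reimplements that rule so it can be compiled and tried without llama.cpp; it is an illustration, not part of the commit, and it uses an abbreviated stopword list and skips the deduplication that token_exists() performs.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* Abbreviated stopword list; prompt.c carries a much longer one. */
static const char *stops[] = { "that", "this", "what", "when", "where", "which", "with", "from" };

static int is_stop(const char *w) {
    for (size_t i = 0; i < sizeof(stops) / sizeof(stops[0]); i++)
        if (strcmp(stops[i], w) == 0) return 1;
    return 0;
}

/* Same rule as prompt.c: a keyword is a lowercased alphanumeric run of
 * four or more characters that is not a stopword; two strings overlap
 * when they share at least one keyword. */
static int overlap(const char *a, const char *b) {
    char keys[64][32];
    int nkeys = 0;
    char buf[32];
    int len = 0;
    /* pass 1: collect the keywords of b */
    for (const unsigned char *p = (const unsigned char *)b; ; p++) {
        if (isalnum(*p)) {
            if (len < 31) buf[len++] = (char)tolower(*p);
        } else {
            buf[len] = '\0';
            if (len >= 4 && !is_stop(buf) && nkeys < 64) strcpy(keys[nkeys++], buf);
            len = 0;
            if (*p == '\0') break;
        }
    }
    /* pass 2: does a contain any of them? */
    len = 0;
    for (const unsigned char *p = (const unsigned char *)a; ; p++) {
        if (isalnum(*p)) {
            if (len < 31) buf[len++] = (char)tolower(*p);
        } else {
            buf[len] = '\0';
            if (len >= 4 && !is_stop(buf))
                for (int i = 0; i < nkeys; i++)
                    if (strcmp(keys[i], buf) == 0) return 1;
            len = 0;
            if (*p == '\0') break;
        }
    }
    return 0;
}

int main(void) {
    const char *ctx = "Gandalf is a wizard in The Lord of the Rings with a grey beard and a staff.";
    printf("%d\n", overlap("Who is Gandalf?", ctx));      /* 1: shares "gandalf" */
    printf("%d\n", overlap("Who is Harry Potter?", ctx)); /* 0: no shared keyword */
    return 0;
}

Against the shipped context.txt facts, "Who is Gandalf?" overlaps on the keyword gandalf, while "Who is Harry Potter?" shares no qualifying keyword, which is exactly the case routed to refusal_text.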
diff --git a/vectordb.c b/vectordb.c
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include "llama.h"
+#include "vectordb.h"
+
+static float cosine_similarity(float *a, float *b, int n) {
+	float dot = 0, normA = 0, normB = 0;
+	for (int i = 0; i < n; i++) {
+		dot += a[i] * b[i];
+		normA += a[i] * a[i];
+		normB += b[i] * b[i];
+	}
+	return dot / (sqrtf(normA) * sqrtf(normB) + 1e-8f);
+}
+
+static void embed_text(struct llama_context *ctx, const char *text, float *out) {
+	llama_token tokens[512];
+	const struct llama_model *model = llama_get_model(ctx);
+	const struct llama_vocab *vocab = llama_model_get_vocab(model);
+	int n_tokens = llama_tokenize(
+			vocab,
+			text,
+			strlen(text),
+			tokens,
+			512,
+			true,
+			true
+			);
+	if (n_tokens < 0) {
+		return;
+	}
+
+	struct llama_batch batch = llama_batch_get_one(tokens, n_tokens);
+	llama_decode(ctx, batch);
+
+	const float *emb = llama_get_embeddings(ctx);
+	memcpy(out, emb, sizeof(float) * VDB_EMBED_SIZE);
+
+}
+
+void vdb_init(VectorDB *db, struct llama_context *embed_ctx) {
+	memset(db, 0, sizeof(VectorDB));
+	db->embed_ctx = embed_ctx;
+}
+
+void vdb_free(VectorDB *db) {
+	(void)db; // nothing yet (future persistence etc.)
+}
+
+void vdb_add_document(VectorDB *db, const char *text) {
+	if (db->count >= VDB_MAX_DOCS) {
+		printf("VectorDB full!\n");
+		return;
+	}
+
+	VectorDoc *doc = &db->docs[db->count++];
+	strncpy(doc->text, text, VDB_MAX_TEXT - 1);
+	doc->text[VDB_MAX_TEXT - 1] = 0;
+
+	printf("Embedding doc %d...\n", db->count);
+	embed_text(db->embed_ctx, text, doc->embedding);
+}
+
+void vdb_embed_query(VectorDB *db, const char *text, float *out_embedding) {
+	embed_text(db->embed_ctx, text, out_embedding);
+}
+
+void vdb_search(VectorDB *db, float *query, int top_k, int *results) {
+	float best_scores[top_k];
+	for (int i = 0; i < top_k; i++) {
+		best_scores[i] = -1.0f;
+		results[i] = -1;
+	}
+
+	for (int i = 0; i < db->count; i++) {
+		float score = cosine_similarity(query, db->docs[i].embedding, VDB_EMBED_SIZE);
+
+		for (int j = 0; j < top_k; j++) {
+			if (score > best_scores[j]) {
+				for (int k = top_k - 1; k > j; k--) {
+					best_scores[k] = best_scores[k - 1];
+					results[k] = results[k - 1];
+				}
+				best_scores[j] = score;
+				results[j] = i;
+				break;
+			}
+		}
+	}
+}
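
Aside: vdb_search() ranks documents by cosine similarity, which measures the angle between two embedding vectors and ignores their magnitudes: about 1.0 for vectors pointing the same way, 0.0 for orthogonal ones. The 1e-8f term only guards against a zero division for all-zero vectors. A tiny self-contained check of the same formula (compile with -lm), separate from the commit:

#include <math.h>
#include <stdio.h>

/* Same formula as cosine_similarity() above, including the 1e-8f
 * guard that avoids dividing by zero for all-zero vectors. */
static float cosine(const float *a, const float *b, int n) {
    float dot = 0, na = 0, nb = 0;
    for (int i = 0; i < n; i++) {
        dot += a[i] * b[i];
        na += a[i] * a[i];
        nb += b[i] * b[i];
    }
    return dot / (sqrtf(na) * sqrtf(nb) + 1e-8f);
}

int main(void) {
    float x[] = {1, 0}; /* along the first axis   */
    float y[] = {1, 1}; /* 45 degrees away from x */
    float z[] = {0, 1}; /* orthogonal to x        */
    printf("%f\n", cosine(x, y, 2)); /* ~0.707107 */
    printf("%f\n", cosine(x, z, 2)); /*  0.000000 */
    printf("%f\n", cosine(x, x, 2)); /* ~1.000000 */
    return 0;
}

The nested loop in vdb_search() is then just an insertion sort that keeps the top_k best scores, shifting weaker candidates down as better ones arrive; with VDB_MAX_DOCS at 1000, a brute-force scan over every document is entirely adequate.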
diff --git a/vectordb.h b/vectordb.h
+#ifndef VECTORDB_H
+#define VECTORDB_H
+
+#include "llama.h"
+
+#define VDB_MAX_DOCS 1000
+#define VDB_EMBED_SIZE 768
+#define VDB_MAX_TEXT 1024
+
+typedef struct {
+	float embedding[VDB_EMBED_SIZE];
+	char text[VDB_MAX_TEXT];
+} VectorDoc;
+
+typedef struct {
+	VectorDoc docs[VDB_MAX_DOCS];
+	int count;
+	struct llama_context *embed_ctx;
+} VectorDB;
+
+void vdb_init(VectorDB *db, struct llama_context *embed_ctx);
+void vdb_free(VectorDB *db);
+
+void vdb_add_document(VectorDB *db, const char *text);
+
+void vdb_embed_query(VectorDB *db, const char *text, float *out_embedding);
+void vdb_search(VectorDB *db, float *query_embedding, int top_k, int *results);
+
+#endif
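
Aside: the header above is the whole public API. Below is a minimal sketch of a caller, mirroring what generate_context() in prompt.c does; it is not part of the commit, the model path is a placeholder, and it assumes the chosen GGUF embedding model produces vectors of exactly VDB_EMBED_SIZE (768) floats. Every llama.cpp call used here is one the commit itself uses.

#include <stdio.h>

#include "llama.h"
#include "vectordb.h"

/* Kept static rather than on the stack: the struct weighs about 4 MB
 * (1000 docs x (768 floats + 1 KB of text)). */
static VectorDB db;

int main(void) {
    llama_backend_init();

    /* Placeholder path; any GGUF embedding model whose embedding
     * width matches VDB_EMBED_SIZE will do. */
    struct llama_model *model = llama_model_load_from_file("models/embedding.gguf", llama_model_default_params());
    if (model == NULL) return 1;

    struct llama_context_params cparams = llama_context_default_params();
    cparams.embeddings = true; /* embedding mode, as in generate_context() */
    struct llama_context *embed_ctx = llama_init_from_model(model, cparams);
    if (embed_ctx == NULL) {
        llama_model_free(model);
        return 1;
    }

    vdb_init(&db, embed_ctx);
    vdb_add_document(&db, "Gandalf is a wizard in The Lord of the Rings.");
    vdb_add_document(&db, "Frodo is from the Shire and carried the One Ring to Mount Doom.");
    vdb_add_document(&db, "Mordor is the realm of Sauron and contains Mount Doom.");

    float query[VDB_EMBED_SIZE];
    int results[2];
    vdb_embed_query(&db, "Who is Gandalf?", query);
    vdb_search(&db, query, 2, results);

    for (int i = 0; i < 2; i++) {
        if (results[i] >= 0) {
            printf("#%d: %s\n", i + 1, db.docs[results[i]].text);
        }
    }

    vdb_free(&db);
    llama_free(embed_ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}

Build it the way the Makefile builds prompt: compiled and linked together with vectordb.c against the llama.cpp libraries.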