1#pragma once
2//
3// common/ngram-map.h: structures used to manage a map from n-grams to a list of m-grams
4//
5// These structures are used to do a lookup of n-grams followed by m-grams in token history.
6//
7// There are two algorithms implemented:
8// 1. ngram_simple: lookup of n-grams followed by m-grams in token history.
9// 2. ngram_map: lookup of n-grams followed by m-grams in token history using a map.
10// The map is a vector of key n-grams, and for each key n-gram there is a list of value m-grams.
11//
12// ref: https://github.com/ggml-org/llama.cpp/pull/18471
13//
14
15#include "llama.h"
16#include "common.h"
17
18#include <vector>
19
20// n-gram simple
21//
22
23// config of n-gram simple.
24struct common_ngram_simple_config {
25 uint16_t size_ngram; // size of n-grams to lookup in self-mode
26 uint16_t size_mgram; // size of m-grams to draft in self-mode
27};
28
29// Searches for a n-gram in the history and checks whether a draft sequence should be generated.
30llama_tokens common_ngram_simple_draft(
31 const common_ngram_simple_config & config,
32 const llama_tokens & tokens, llama_token sampled);
33
34
35// n-gram map
36//
37
38// maximum number of m-gram values stored for each key n-gram.
39#define COMMON_NGRAM_MAX_VALUES 4
40
41// number of entries in the (optional, size 0 to disable) map from ngram-hash to ngram-index.
42#define COMMON_NGRAM_HASH_MAP_SIZE 262144
43
44// statistics of a m-gram after a known n-gram
45struct common_ngram_map_value {
46 size_t value_idx = 0; // index of value m-gram in token-history (0 if unused)
47 uint16_t value_num = 0; // number of occurrences of this value m-gram after the key n-gram (0 in an unused values-slot)
48 int16_t n_accepted = -1; // number of accepted tokens at last draft (-1 if unused)
49};
50
51// statistics of a n-gram
52struct common_ngram_map_key {
53 size_t key_idx; // index of key n-gram in token-history
54 size_t stat_idx; // index of last token of stastistics computation (key_num, values)
55
56 uint16_t key_num; // number of occurrences of this key n-gram in token-history
57 common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key
58};
59
60// map from n-grams to following m-grams in token-history
61struct common_ngram_map {
62 uint16_t size_key; // size of key n-grams
63 uint16_t size_value; // size of value m-grams
64
65 bool key_only; // true if only key n-grams are used, no values.
66
67 std::vector<common_ngram_map_key> keys; // key n-grams which occur several times in token-history
68 uint16_t min_hits; // minimum number of key hits to consider a draft
69
70 bool show_key_map_stats = false; // true, if statistics of the key_map should be printed.
71
72 common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys,
73 uint16_t min_hits)
74 : size_key(sz_key), size_value(sz_value), key_only(only_keys),
75 min_hits(min_hits) {
76 key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used
77 }
78
79 // In reasoning chats the previous reasoning block will be removed from context history.
80 // A rebuild of the ngram map is needed after that.
81
82 size_t size_last_begin = 0; // number of tokens at previous start of generation
83
84 bool last_draft_created = false; // true if a draft was created at last call.
85 size_t last_draft_key_idx = 0; // index of last key used for draft generation (0 = no draft)
86 uint16_t last_draft_value_idx = 0; // index of last value used for draft generation.
87
88 size_t idx_last_check = 0; // index of last check in context history
89
90 // optional map "hash to ngram-index" for faster lookup of n-grams. map is empty if unused.
91 //
92 // uint32_t instead of size_t (size of current histories is << UINT32_MAX)
93 std::vector<uint32_t> key_map; // key_map[hash] = index of ngram in context window
94 uint32_t key_map_last_idx = 0; // index of the last ngram added to key_map
95};
96
97// Initialize the n-gram map with the given token history.
98// map: the ngram map to initialize.
99// tokens: the token history to base the map on.
100void common_ngram_map_begin(
101 common_ngram_map & map,
102 const llama_tokens & tokens);
103
104// Searches for the n-gram in the history and checks whether a draft sequence should be generated.
105// map: the ngram map to search in.
106// inp: the tokens generated so far.
107// sampled: the token that was just sampled.
108// draft: vector to store the draft tokens, initially empty.
109void common_ngram_map_draft(
110 common_ngram_map & map,
111 const llama_tokens & inp, llama_token sampled,
112 llama_tokens & draft);
113
114// Update the statistics of a value after a draft was processed.
115void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted);