1//@ts-check
  2// Helpers to work with different data types
  3// by Humans for All
  4//
  5
  6/**
  7 * Given the limited context size of local LLMs and , many a times when context gets filled
  8 * between the prompt and the response, it can lead to repeating text garbage generation.
  9 * And many a times setting penalty wrt repeatation leads to over-intelligent garbage
 10 * repeatation with slight variations. These garbage inturn can lead to overloading of the
 11 * available model context, leading to less valuable response for subsequent prompts/queries,
 12 * if chat history is sent to ai model.
 13 *
 14 * So two simple minded garbage trimming logics are experimented below.
 15 * * one based on progressively-larger-substring-based-repeat-matching-with-partial-skip and
 16 * * another based on char-histogram-driven garbage trimming.
 17 *   * in future characteristic of histogram over varying lengths could be used to allow for
 18 *     a more aggressive and adaptive trimming logic.
 19 */
 20
 21
 22/**
 23 * Simple minded logic to help remove repeating garbage at end of the string.
 24 * The repeatation needs to be perfectly matching.
 25 *
 26 * The logic progressively goes on probing for longer and longer substring based
 27 * repeatation, till there is no longer repeatation. Inturn picks the one with
 28 * the longest chain.
 29 *
 30 * @param {string} sIn
 31 * @param {number} maxSubL
 32 * @param {number} maxMatchLenThreshold
 33 */
 34export function trim_repeat_garbage_at_end(sIn, maxSubL=10, maxMatchLenThreshold=40) {
 35    let rCnt = [0];
 36    let maxMatchLen = maxSubL;
 37    let iMML = -1;
 38    for(let subL=1; subL < maxSubL; subL++) {
 39        rCnt.push(0);
 40        let i;
 41        let refS = sIn.substring(sIn.length-subL, sIn.length);
 42        for(i=sIn.length; i > 0; i -= subL) {
 43            let curS = sIn.substring(i-subL, i);
 44            if (refS != curS) {
 45                let curMatchLen = rCnt[subL]*subL;
 46                if (maxMatchLen < curMatchLen) {
 47                    maxMatchLen = curMatchLen;
 48                    iMML = subL;
 49                }
 50                break;
 51            }
 52            rCnt[subL] += 1;
 53        }
 54    }
 55    console.debug("DBUG:DU:TrimRepeatGarbage:", rCnt);
 56    if ((iMML == -1) || (maxMatchLen < maxMatchLenThreshold)) {
 57        return {trimmed: false, data: sIn};
 58    }
 59    console.debug("DBUG:TrimRepeatGarbage:TrimmedCharLen:", maxMatchLen);
 60    let iEnd = sIn.length - maxMatchLen;
 61    return { trimmed: true, data: sIn.substring(0, iEnd) };
 62}
 63
 64
 65/**
 66 * Simple minded logic to help remove repeating garbage at end of the string, till it cant.
 67 * If its not able to trim, then it will try to skip a char at end and then trim, a few times.
 68 * This ensures that even if there are multiple runs of garbage with different patterns, the
 69 * logic still tries to munch through them.
 70 *
 71 * @param {string} sIn
 72 * @param {number} maxSubL
 73 * @param {number | undefined} [maxMatchLenThreshold]
 74 */
 75export function trim_repeat_garbage_at_end_loop(sIn, maxSubL, maxMatchLenThreshold, skipMax=16) {
 76    let sCur = sIn;
 77    let sSaved = "";
 78    let iTry = 0;
 79    while(true) {
 80        let got = trim_repeat_garbage_at_end(sCur, maxSubL, maxMatchLenThreshold);
 81        if (got.trimmed != true) {
 82            if (iTry == 0) {
 83                sSaved = got.data;
 84            }
 85            iTry += 1;
 86            if (iTry >= skipMax) {
 87                return sSaved;
 88            }
 89            got.data = got.data.substring(0,got.data.length-1);
 90        } else {
 91            iTry = 0;
 92        }
 93        sCur = got.data;
 94    }
 95}
 96
 97
 98/**
 99 * A simple minded try trim garbage at end using histogram driven characteristics.
100 * There can be variation in the repeatations, as long as no new char props up.
101 *
102 * This tracks the chars and their frequency in a specified length of substring at the end
103 * and inturn checks if moving further into the generated text from the end remains within
104 * the same char subset or goes beyond it and based on that either trims the string at the
105 * end or not. This allows to filter garbage at the end, including even if there are certain
106 * kind of small variations in the repeated text wrt position of seen chars.
107 *
108 * Allow the garbage to contain upto maxUniq chars, but at the same time ensure that
109 * a given type of char ie numerals or alphabets or other types dont cross the specified
110 * maxType limit. This allows intermixed text garbage to be identified and trimmed.
111 *
112 * ALERT: This is not perfect and only provides a rough garbage identification logic.
113 * Also it currently only differentiates between character classes wrt english.
114 *
115 * @param {string} sIn
116 * @param {number} maxType
117 * @param {number} maxUniq
118 * @param {number} maxMatchLenThreshold
119 */
120export function trim_hist_garbage_at_end(sIn, maxType, maxUniq, maxMatchLenThreshold) {
121    if (sIn.length < maxMatchLenThreshold) {
122        return { trimmed: false, data: sIn };
123    }
124    let iAlp = 0;
125    let iNum = 0;
126    let iOth = 0;
127    // Learn
128    let hist = {};
129    let iUniq = 0;
130    for(let i=0; i<maxMatchLenThreshold; i++) {
131        let c = sIn[sIn.length-1-i];
132        if (c in hist) {
133            hist[c] += 1;
134        } else {
135            if(c.match(/[0-9]/) != null) {
136                iNum += 1;
137            } else if(c.match(/[A-Za-z]/) != null) {
138                iAlp += 1;
139            } else {
140                iOth += 1;
141            }
142            iUniq += 1;
143            if (iUniq >= maxUniq) {
144                break;
145            }
146            hist[c] = 1;
147        }
148    }
149    console.debug("DBUG:TrimHistGarbage:", hist);
150    if ((iAlp > maxType) || (iNum > maxType) || (iOth > maxType)) {
151        return { trimmed: false, data: sIn };
152    }
153    // Catch and Trim
154    for(let i=0; i < sIn.length; i++) {
155        let c = sIn[sIn.length-1-i];
156        if (!(c in hist)) {
157            if (i < maxMatchLenThreshold) {
158                return { trimmed: false, data: sIn };
159            }
160            console.debug("DBUG:TrimHistGarbage:TrimmedCharLen:", i);
161            return { trimmed: true, data: sIn.substring(0, sIn.length-i+1) };
162        }
163    }
164    console.debug("DBUG:TrimHistGarbage:Trimmed fully");
165    return { trimmed: true, data: "" };
166}
167
168/**
169 * Keep trimming repeatedly using hist_garbage logic, till you no longer can.
170 * This ensures that even if there are multiple runs of garbage with different patterns,
171 * the logic still tries to munch through them.
172 *
173 * @param {any} sIn
174 * @param {number} maxType
175 * @param {number} maxUniq
176 * @param {number} maxMatchLenThreshold
177 */
178export function trim_hist_garbage_at_end_loop(sIn, maxType, maxUniq, maxMatchLenThreshold) {
179    let sCur = sIn;
180    while (true) {
181        let got = trim_hist_garbage_at_end(sCur, maxType, maxUniq, maxMatchLenThreshold);
182        if (!got.trimmed) {
183            return got.data;
184        }
185        sCur = got.data;
186    }
187}
188
189/**
190 * Try trim garbage at the end by using both the hist-driven-garbage-trimming as well as
191 * skip-a-bit-if-reqd-then-repeat-pattern-based-garbage-trimming, with blind retrying.
192 * @param {string} sIn
193 */
194export function trim_garbage_at_end(sIn) {
195    let sCur = sIn;
196    for(let i=0; i<2; i++) {
197        sCur = trim_hist_garbage_at_end_loop(sCur, 8, 24, 72);
198        sCur = trim_repeat_garbage_at_end_loop(sCur, 32, 72, 12);
199    }
200    return sCur;
201}
202
203
204/**
205 * NewLines array helper.
206 * Allow for maintaining a list of lines.
207 * Allow for a line to be builtup/appended part by part.
208 */
209export class NewLines {
210
211    constructor() {
212        /** @type {string[]} */
213        this.lines = [];
214    }
215
216    /**
217     * Extracts lines from the passed string and inturn either
218     * append to a previous partial line or add a new line.
219     * @param {string} sLines
220     */
221    add_append(sLines) {
222        let aLines = sLines.split("\n");
223        let lCnt = 0;
224        for(let line of aLines) {
225            lCnt += 1;
226            // Add back newline removed if any during split
227            if (lCnt < aLines.length) {
228                line += "\n";
229            } else {
230                if (sLines.endsWith("\n")) {
231                    line += "\n";
232                }
233            }
234            // Append if required
235            if (lCnt == 1) {
236                let lastLine = this.lines[this.lines.length-1];
237                if (lastLine != undefined) {
238                    if (!lastLine.endsWith("\n")) {
239                        this.lines[this.lines.length-1] += line;
240                        continue;
241                    }
242                }
243            }
244            // Add new line
245            this.lines.push(line);
246        }
247    }
248
249    /**
250     * Shift the oldest/earliest/0th line in the array. [Old-New|Earliest-Latest]
251     * Optionally control whether only full lines (ie those with newline at end) will be returned
252     * or will a partial line without a newline at end (can only be the last line) be returned.
253     * @param {boolean} bFullWithNewLineOnly
254     */
255    shift(bFullWithNewLineOnly=true) {
256        let line = this.lines[0];
257        if (line == undefined) {
258            return undefined;
259        }
260        if ((line[line.length-1] != "\n") && bFullWithNewLineOnly){
261            return undefined;
262        }
263        return this.lines.shift();
264    }
265
266}