summary refs log tree commit diff
path: root/llama.cpp/common/jinja/lexer.h
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp/common/jinja/lexer.h')
-rw-r--r-- llama.cpp/common/jinja/lexer.h 157
1 files changed, 157 insertions, 0 deletions
diff --git a/llama.cpp/common/jinja/lexer.h b/llama.cpp/common/jinja/lexer.h
new file mode 100644
index 0000000..439c857
--- /dev/null
+++ b/llama.cpp/common/jinja/lexer.h
@@ -0,0 +1,157 @@
+#pragma once
+
+#include "utils.h"
+
+#include <cctype>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace jinja {
+
+// A single lexical token of a Jinja template: its kind, its text and a position.
+struct token {
+ // Kind of token. The enum is unscoped, so enumerators are spelled
+ // token::eof, token::text, and so on. Enumerator order determines the
+ // underlying integer values; append new kinds rather than reordering.
+ enum type {
+ eof, // end of source
+ text, // The text between Jinja statements or expressions
+
+ numeric_literal, // e.g., 123, 1.0
+ string_literal, // 'string'
+ identifier, // Variables, functions, statements, booleans, etc.
+ equals, // =
+ open_paren, // (
+ close_paren, // )
+ open_statement, // {%
+ close_statement, // %}
+ open_expression, // {{
+ close_expression, // }}
+ open_square_bracket, // [
+ close_square_bracket, // ]
+ open_curly_bracket, // {
+ close_curly_bracket, // }
+ comma, // ,
+ dot, // .
+ colon, // :
+ pipe, // |
+
+ call_operator, // ()
+ additive_binary_operator, // + - ~
+ multiplicative_binary_operator, // * / %
+ comparison_binary_operator, // < > <= >= == !=
+ unary_operator, // ! - +
+ comment, // {# ... #}
+ };
+ // Kind of this token.
+ type t;
+ // Text carried by the token.
+ // NOTE(review): presumably the matched lexeme (with escapes resolved for
+ // string literals) -- confirm against the tokenize implementation.
+ std::string value;
+ // NOTE(review): presumably the offset of the token within the source
+ // string -- confirm against the tokenize implementation.
+ size_t pos;
+};
+
+// Returns the enumerator name of a token kind as a string (e.g. token::eof
+// yields "eof"). Values that match no enumerator yield "unknown".
+//
+// Declared `inline` rather than `static` because this function is defined in
+// a header: `inline` lets all translation units share one definition and
+// avoids unused-function warnings in files that include the header without
+// calling it.
+inline std::string type_to_string(token::type t) {
+    switch (t) {
+        case token::eof:                            return "eof";
+        case token::text:                           return "text";
+        case token::numeric_literal:                return "numeric_literal";
+        case token::string_literal:                 return "string_literal";
+        case token::identifier:                     return "identifier";
+        case token::equals:                         return "equals";
+        case token::open_paren:                     return "open_paren";
+        case token::close_paren:                    return "close_paren";
+        case token::open_statement:                 return "open_statement";
+        case token::close_statement:                return "close_statement";
+        case token::open_expression:                return "open_expression";
+        case token::close_expression:               return "close_expression";
+        case token::open_square_bracket:            return "open_square_bracket";
+        case token::close_square_bracket:           return "close_square_bracket";
+        case token::open_curly_bracket:             return "open_curly_bracket";
+        case token::close_curly_bracket:            return "close_curly_bracket";
+        case token::comma:                          return "comma";
+        case token::dot:                            return "dot";
+        case token::colon:                          return "colon";
+        case token::pipe:                           return "pipe";
+        case token::call_operator:                  return "call_operator";
+        case token::additive_binary_operator:       return "additive_binary_operator";
+        case token::multiplicative_binary_operator: return "multiplicative_binary_operator";
+        case token::comparison_binary_operator:     return "comparison_binary_operator";
+        case token::unary_operator:                 return "unary_operator";
+        case token::comment:                        return "comment";
+        // Fallback for integer values cast to token::type that name no enumerator.
+        default:                                    return "unknown";
+    }
+}
+
+// Return value of lexer::tokenize: a list of tokens plus a source string.
+struct lexer_result {
+ // Tokens produced by the lexer.
+ std::vector<token> tokens;
+ // NOTE(review): presumably the source text the tokens were produced from
+ // (possibly after preprocessing) -- confirm against the tokenize implementation.
+ std::string source;
+};
+
+// Lexer for Jinja templates. Holds the lookup tables used while scanning and
+// declares tokenize(), whose definition is not in this header.
+struct lexer {
+ // Maps an escape letter to the character it stands for (e.g. 'n' -> '\n').
+ // NOTE(review): presumably consulted for the character following a
+ // backslash inside string literals -- confirm against tokenize.
+ // NOTE(review): this is a per-instance const member, so each lexer object
+ // builds its own copy and the struct is not copy-assignable.
+ const std::map<char, char> escape_chars = {
+ {'n', '\n'},
+ {'t', '\t'},
+ {'r', '\r'},
+ {'b', '\b'},
+ {'f', '\f'},
+ {'v', '\v'},
+ {'\\', '\\'},
+ {'\'', '\''},
+ {'\"', '\"'},
+ };
+
+ // True if c is alphanumeric (per std::isalnum) or an underscore.
+ // The cast to unsigned char keeps negative char values from reaching
+ // std::isalnum, which is undefined behavior.
+ static bool is_word(char c) {
+ return std::isalnum(static_cast<unsigned char>(c)) || c == '_';
+ }
+
+ // True if c is a decimal digit (per std::isdigit). Same unsigned char
+ // cast as in is_word, for the same reason.
+ static bool is_integer(char c) {
+ return std::isdigit(static_cast<unsigned char>(c));
+ }
+
+ // Table of fixed character sequences and the token kind each one maps to.
+ // Entry order is significant: every longer sequence is listed before any
+ // shorter sequence that is a prefix of it ("{%-" before "{%" before "{",
+ // "-%}" before "-", "<=" before "<", "==" before "=").
+ // NOTE(review): presumably scanned front to back taking the first match,
+ // which makes this a longest-match table -- confirm against tokenize
+ // before reordering or appending entries.
+ const std::vector<std::pair<std::string, token::type>> ordered_mapping_table = {
+ // Trimmed control sequences
+ {"{%-", token::open_statement},
+ {"-%}", token::close_statement},
+ {"{{-", token::open_expression},
+ {"-}}", token::close_expression},
+ // Control sequences
+ {"{%", token::open_statement},
+ {"%}", token::close_statement},
+ {"{{", token::open_expression},
+ {"}}", token::close_expression},
+ // Single character tokens
+ {"(", token::open_paren},
+ {")", token::close_paren},
+ {"{", token::open_curly_bracket},
+ {"}", token::close_curly_bracket},
+ {"[", token::open_square_bracket},
+ {"]", token::close_square_bracket},
+ {",", token::comma},
+ {".", token::dot},
+ {":", token::colon},
+ {"|", token::pipe},
+ // Comparison operators
+ {"<=", token::comparison_binary_operator},
+ {">=", token::comparison_binary_operator},
+ {"==", token::comparison_binary_operator},
+ {"!=", token::comparison_binary_operator},
+ {"<", token::comparison_binary_operator},
+ {">", token::comparison_binary_operator},
+ // Arithmetic operators
+ {"+", token::additive_binary_operator},
+ {"-", token::additive_binary_operator},
+ {"~", token::additive_binary_operator},
+ {"*", token::multiplicative_binary_operator},
+ {"/", token::multiplicative_binary_operator},
+ {"%", token::multiplicative_binary_operator},
+ // Assignment operator
+ {"=", token::equals},
+ };
+
+ // Tokenize the source string into a list of tokens.
+ // Returns a lexer_result holding the tokens and a source string.
+ // May throw lexer_exception on error.
+ lexer_result tokenize(const std::string & source);
+};
+
+// Exception type for lexing failures. The what() message is whatever
+// fmt_error_with_source("lexer", msg, source, pos) returns for the given
+// message, source string and position.
+struct lexer_exception : std::runtime_error {
+    lexer_exception(const std::string & msg,
+                    const std::string & source,
+                    size_t              pos)
+        : std::runtime_error(fmt_error_with_source("lexer", msg, source, pos))
+    {
+    }
+};
+
+} // namespace jinja