diff options
Diffstat (limited to 'llama.cpp/common/jinja/lexer.h')
| -rw-r--r-- | llama.cpp/common/jinja/lexer.h | 157 |
1 files changed, 157 insertions, 0 deletions
diff --git a/llama.cpp/common/jinja/lexer.h b/llama.cpp/common/jinja/lexer.h new file mode 100644 index 0000000..439c857 --- /dev/null +++ b/llama.cpp/common/jinja/lexer.h @@ -0,0 +1,157 @@ +#pragma once + +#include "utils.h" + +#include <cctype> +#include <map> +#include <stdexcept> +#include <string> +#include <vector> + +namespace jinja { + +struct token { + enum type { + eof, // end of source + text, // The text between Jinja statements or expressions + + numeric_literal, // e.g., 123, 1.0 + string_literal, // 'string' + identifier, // Variables, functions, statements, booleans, etc. + equals, // = + open_paren, // ( + close_paren, // ) + open_statement, // {% + close_statement, // %} + open_expression, // {{ + close_expression, // }} + open_square_bracket, // [ + close_square_bracket, // ] + open_curly_bracket, // { + close_curly_bracket, // } + comma, // , + dot, // . + colon, // : + pipe, // | + + call_operator, // () + additive_binary_operator, // + - ~ + multiplicative_binary_operator, // * / % + comparison_binary_operator, // < > <= >= == != + unary_operator, // ! - + + comment, // {# ... #} + }; + type t; + std::string value; + size_t pos; +}; + +static std::string type_to_string(token::type t) { + switch (t) { + case token::eof: return "eof"; + case token::text: return "text"; + case token::numeric_literal: return "numeric_literal"; + case token::string_literal: return "string_literal"; + case token::identifier: return "identifier"; + case token::equals: return "equals"; + case token::open_paren: return "open_paren"; + case token::close_paren: return "close_paren"; + case token::open_statement: return "open_statement"; + case token::close_statement: return "close_statement"; + case token::open_expression: return "open_expression"; + case token::close_expression: return "close_expression"; + case token::open_square_bracket: return "open_square_bracket"; + case token::close_square_bracket: return "close_square_bracket"; + case token::open_curly_bracket: return "open_curly_bracket"; + case token::close_curly_bracket: return "close_curly_bracket"; + case token::comma: return "comma"; + case token::dot: return "dot"; + case token::colon: return "colon"; + case token::pipe: return "pipe"; + case token::call_operator: return "call_operator"; + case token::additive_binary_operator: return "additive_binary_operator"; + case token::multiplicative_binary_operator: return "multiplicative_binary_operator"; + case token::comparison_binary_operator: return "comparison_binary_operator"; + case token::unary_operator: return "unary_operator"; + case token::comment: return "comment"; + default: return "unknown"; + } +} + +struct lexer_result { + std::vector<token> tokens; + std::string source; +}; + +struct lexer { + const std::map<char, char> escape_chars = { + {'n', '\n'}, + {'t', '\t'}, + {'r', '\r'}, + {'b', '\b'}, + {'f', '\f'}, + {'v', '\v'}, + {'\\', '\\'}, + {'\'', '\''}, + {'\"', '\"'}, + }; + + static bool is_word(char c) { + return std::isalnum(static_cast<unsigned char>(c)) || c == '_'; + } + + static bool is_integer(char c) { + return std::isdigit(static_cast<unsigned char>(c)); + } + + const std::vector<std::pair<std::string, token::type>> ordered_mapping_table = { + // Trimmed control sequences + {"{%-", token::open_statement}, + {"-%}", token::close_statement}, + {"{{-", token::open_expression}, + {"-}}", token::close_expression}, + // Control sequences + {"{%", token::open_statement}, + {"%}", token::close_statement}, + {"{{", token::open_expression}, + {"}}", token::close_expression}, + // Single character tokens + {"(", token::open_paren}, + {")", token::close_paren}, + {"{", token::open_curly_bracket}, + {"}", token::close_curly_bracket}, + {"[", token::open_square_bracket}, + {"]", token::close_square_bracket}, + {",", token::comma}, + {".", token::dot}, + {":", token::colon}, + {"|", token::pipe}, + // Comparison operators + {"<=", token::comparison_binary_operator}, + {">=", token::comparison_binary_operator}, + {"==", token::comparison_binary_operator}, + {"!=", token::comparison_binary_operator}, + {"<", token::comparison_binary_operator}, + {">", token::comparison_binary_operator}, + // Arithmetic operators + {"+", token::additive_binary_operator}, + {"-", token::additive_binary_operator}, + {"~", token::additive_binary_operator}, + {"*", token::multiplicative_binary_operator}, + {"/", token::multiplicative_binary_operator}, + {"%", token::multiplicative_binary_operator}, + // Assignment operator + {"=", token::equals}, + }; + + // tokenize the source string into a list of tokens + // may throw lexer_exception on error + lexer_result tokenize(const std::string & source); +}; + +struct lexer_exception : public std::runtime_error { + lexer_exception(const std::string & msg, const std::string & source, size_t pos) + : std::runtime_error(fmt_error_with_source("lexer", msg, source, pos)) {} +}; + +} // namespace jinja |
