1#pragma once
  2
  3#include "utils.h"
  4
  5#include <cctype>
  6#include <map>
  7#include <stdexcept>
  8#include <string>
  9#include <vector>
 10
 11namespace jinja {
 12
 13struct token {
 14    enum type {
 15        eof, // end of source
 16        text, // The text between Jinja statements or expressions
 17
 18        numeric_literal, // e.g., 123, 1.0
 19        string_literal, // 'string'
 20        identifier, // Variables, functions, statements, booleans, etc.
 21        equals, // =
 22        open_paren, // (
 23        close_paren, // )
 24        open_statement, // {%
 25        close_statement, // %}
 26        open_expression, // {{
 27        close_expression, // }}
 28        open_square_bracket, // [
 29        close_square_bracket, // ]
 30        open_curly_bracket, // {
 31        close_curly_bracket, // }
 32        comma, // ,
 33        dot, // .
 34        colon, // :
 35        pipe, // |
 36
 37        call_operator, // ()
 38        additive_binary_operator, // + - ~
 39        multiplicative_binary_operator, // * / %
 40        comparison_binary_operator, // < > <= >= == !=
 41        unary_operator, // ! - +
 42        comment, // {# ... #}
 43    };
 44    type t;
 45    std::string value;
 46    size_t pos;
 47};
 48
 49static std::string type_to_string(token::type t) {
 50    switch (t) {
 51        case token::eof: return "eof";
 52        case token::text: return "text";
 53        case token::numeric_literal: return "numeric_literal";
 54        case token::string_literal: return "string_literal";
 55        case token::identifier: return "identifier";
 56        case token::equals: return "equals";
 57        case token::open_paren: return "open_paren";
 58        case token::close_paren: return "close_paren";
 59        case token::open_statement: return "open_statement";
 60        case token::close_statement: return "close_statement";
 61        case token::open_expression: return "open_expression";
 62        case token::close_expression: return "close_expression";
 63        case token::open_square_bracket: return "open_square_bracket";
 64        case token::close_square_bracket: return "close_square_bracket";
 65        case token::open_curly_bracket: return "open_curly_bracket";
 66        case token::close_curly_bracket: return "close_curly_bracket";
 67        case token::comma: return "comma";
 68        case token::dot: return "dot";
 69        case token::colon: return "colon";
 70        case token::pipe: return "pipe";
 71        case token::call_operator: return "call_operator";
 72        case token::additive_binary_operator: return "additive_binary_operator";
 73        case token::multiplicative_binary_operator: return "multiplicative_binary_operator";
 74        case token::comparison_binary_operator: return "comparison_binary_operator";
 75        case token::unary_operator: return "unary_operator";
 76        case token::comment: return "comment";
 77        default: return "unknown";
 78    }
 79}
 80
 81struct lexer_result {
 82    std::vector<token> tokens;
 83    std::string source;
 84};
 85
 86struct lexer {
 87    const std::map<char, char> escape_chars = {
 88        {'n', '\n'},
 89        {'t', '\t'},
 90        {'r', '\r'},
 91        {'b', '\b'},
 92        {'f', '\f'},
 93        {'v', '\v'},
 94        {'\\', '\\'},
 95        {'\'', '\''},
 96        {'\"', '\"'},
 97    };
 98
 99    static bool is_word(char c) {
100        return std::isalnum(static_cast<unsigned char>(c)) || c == '_';
101    }
102
103    static bool is_integer(char c) {
104        return std::isdigit(static_cast<unsigned char>(c));
105    }
106
107    const std::vector<std::pair<std::string, token::type>> ordered_mapping_table = {
108        // Trimmed control sequences
109        {"{%-", token::open_statement},
110        {"-%}", token::close_statement},
111        {"{{-", token::open_expression},
112        {"-}}", token::close_expression},
113        // Control sequences
114        {"{%", token::open_statement},
115        {"%}", token::close_statement},
116        {"{{", token::open_expression},
117        {"}}", token::close_expression},
118        // Single character tokens
119        {"(", token::open_paren},
120        {")", token::close_paren},
121        {"{", token::open_curly_bracket},
122        {"}", token::close_curly_bracket},
123        {"[", token::open_square_bracket},
124        {"]", token::close_square_bracket},
125        {",", token::comma},
126        {".", token::dot},
127        {":", token::colon},
128        {"|", token::pipe},
129        // Comparison operators
130        {"<=", token::comparison_binary_operator},
131        {">=", token::comparison_binary_operator},
132        {"==", token::comparison_binary_operator},
133        {"!=", token::comparison_binary_operator},
134        {"<", token::comparison_binary_operator},
135        {">", token::comparison_binary_operator},
136        // Arithmetic operators
137        {"+", token::additive_binary_operator},
138        {"-", token::additive_binary_operator},
139        {"~", token::additive_binary_operator},
140        {"*", token::multiplicative_binary_operator},
141        {"/", token::multiplicative_binary_operator},
142        {"%", token::multiplicative_binary_operator},
143        // Assignment operator
144        {"=", token::equals},
145    };
146
147    // tokenize the source string into a list of tokens
148    // may throw lexer_exception on error
149    lexer_result tokenize(const std::string & source);
150};
151
152struct lexer_exception : public std::runtime_error {
153    lexer_exception(const std::string & msg, const std::string & source, size_t pos)
154        : std::runtime_error(fmt_error_with_source("lexer", msg, source, pos)) {}
155};
156
157} // namespace jinja