summaryrefslogtreecommitdiff
path: root/examples/dte/regexp.c
blob: dc4eb0f46c9f965e1c4fe428bb24cd5d62f9f2cd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#include <errno.h>
#include <stdlib.h>
#include "regexp.h"
#include "error.h"
#include "util/debug.h"
#include "util/hashmap.h"
#include "util/str-util.h"
#include "util/xmalloc.h"
#include "util/xsnprintf.h"

// Table of compiled patterns handed out by regexp_intern(), keyed by the
// pattern string; its entries are released by free_interned_regexps()
static HashMap interned_regexps;

// Translate the regcomp(3)/regexec(3) error code `err` into text with
// regerror(3) and hand it to error_msg(), formatted as
// "<description>: <pattern>". The return value is whatever error_msg()
// returns.
bool regexp_error_msg(const regex_t *re, const char *pattern, int err)
{
    char description[1024];
    const size_t capacity = sizeof(description);
    regerror(err, re, description, capacity);
    return error_msg("%s: %s", description, pattern);
}

// Compile `pattern` into `re` with regcomp(3), passing `flags` through
// unmodified. Returns true on success; on failure the error is reported
// via regexp_error_msg() and that function's result is returned instead.
bool regexp_compile_internal(regex_t *re, const char *pattern, int flags)
{
    const int status = regcomp(re, pattern, flags);
    return (status == 0) ? true : regexp_error_msg(re, pattern, status);
}

// Compile `pattern` into `re` with REG_EXTENDED added to `flags`. If
// regcomp(3) fails, the regerror(3) description is passed to
// fatal_error() together with EINVAL.
void regexp_compile_or_fatal_error(regex_t *re, const char *pattern, int flags)
{
    // DEFAULT_REGEX_FLAGS is deliberately not used: this function only
    // compiles built-in patterns, which explicitly avoid "enhanced"
    // features
    const int status = regcomp(re, pattern, REG_EXTENDED | flags);
    if (unlikely(status != 0)) {
        char description[1024];
        regerror(status, re, description, sizeof(description));
        fatal_error(description, EINVAL);
    }
}

// Run regexec(3) for `re` against the first `size` bytes of `buf`,
// storing up to `nmatch` matches in `pmatch`. Returns true if regexec()
// reported a match (i.e. returned 0) and false otherwise.
bool regexp_exec (
    const regex_t *re,
    const char *buf,
    size_t size,
    size_t nmatch,
    regmatch_t *pmatch,
    int flags
) {
    // `pmatch` may never be NULL, not even when `nmatch` is 0. Quoting
    // https://man.openbsd.org/regex.3:
    // "If REG_STARTEND is specified, pmatch must point to at least one
    // regmatch_t (even if nmatch is 0 or REG_NOSUB was specified), to
    // hold the input offsets for REG_STARTEND."
    BUG_ON(!pmatch);

    int status;

// REG_STARTEND is avoided in sanitizer builds, because ASan's
// __interceptor_regexec() doesn't support it
#if defined(REG_STARTEND) && !defined(ASAN_ENABLED) && !defined(MSAN_ENABLED)
    // The byte range to search, [0, size), is passed in via pmatch[0]
    pmatch->rm_so = 0;
    pmatch->rm_eo = size;
    status = regexec(re, buf, nmatch, pmatch, REG_STARTEND | flags);
#else
    // Without REG_STARTEND the subject must be null-terminated, so a
    // temporary copy made by xstrcut() is searched instead
    char *copy = xstrcut(buf, size);
    status = regexec(re, copy, nmatch, pmatch, flags);
    free(copy);
#endif

    return status == 0;
}

// Probe regcomp(3) for a working pair of word boundary tokens. Each
// candidate pair is wrapped around "(foo)", compiled, and run against a
// sample text; a pair is accepted only if the overall match is exactly
// the standalone word "foo". The first accepted pair is copied into
// `rwbt` and true is returned. If no candidate works, `rwbt` is left
// unmodified and false is returned.
bool regexp_init_word_boundary_tokens(RegexpWordBoundaryTokens *rwbt)
{
    static const char text[] = "SSfooEE SSfoo fooEE foo SSfooEE";
    const regoff_t match_start = 20, match_end = 23;
    static const RegexpWordBoundaryTokens pairs[] = {
        {"\\<", "\\>"},
        {"[[:<:]]", "[[:>:]]"},
        {"\\b", "\\b"},
    };

    // Sanity checks: the expected offsets must lie inside `text` and
    // must delimit the space-separated "foo"
    BUG_ON(ARRAYLEN(text) <= match_end);
    BUG_ON(!mem_equal(text + match_start - 1, " foo ", 5));

    const RegexpWordBoundaryTokens *const last = pairs + ARRAYLEN(pairs);
    for (const RegexpWordBoundaryTokens *cand = pairs; cand < last; cand++) {
        char patt[32];
        xsnprintf(patt, sizeof(patt), "%s(foo)%s", cand->start, cand->end);

        regex_t re;
        if (regcomp(&re, patt, DEFAULT_REGEX_FLAGS) == 0) {
            regmatch_t m[2];
            int status = regexec(&re, text, ARRAYLEN(m), m, 0);
            regfree(&re);

            // A pair that compiles but matches at any other position
            // is rejected as well
            if (status == 0 && m[0].rm_so == match_start && m[0].rm_eo == match_end) {
                *rwbt = *cand;
                return true;
            }
        }
    }

    return false;
}

// Release the compiled regex embedded in `cr` with regfree(3) and then
// free the CachedRegexp allocation itself
void free_cached_regexp(CachedRegexp *cr)
{
    regex_t *compiled = &cr->re;
    regfree(compiled);
    free(cr);
}

// Look up `pattern` in the table of interned regexps, compiling and
// inserting it first if it isn't present yet. Returns NULL for an empty
// pattern, or if compilation fails (in which case the error is reported
// via regexp_error_msg() and nothing is inserted).
const InternedRegexp *regexp_intern(const char *pattern)
{
    if (pattern[0] == '\0') {
        return NULL;
    }

    InternedRegexp *existing = hashmap_get(&interned_regexps, pattern);
    if (existing) {
        return existing;
    }

    // Not interned yet; compile a new entry
    InternedRegexp *entry = xnew(InternedRegexp, 1);
    const int cflags = DEFAULT_REGEX_FLAGS | REG_NEWLINE | REG_NOSUB;
    const int status = regcomp(&entry->re, pattern, cflags);
    if (unlikely(status != 0)) {
        regexp_error_msg(&entry->re, pattern, status);
        free(entry);
        return NULL;
    }

    // The duplicated pattern string serves both as InternedRegexp::str
    // and as the key under which the entry is inserted
    entry->str = xstrdup(pattern);
    return hashmap_insert(&interned_regexps, entry->str, entry);
}

// Report whether hashmap_find() locates `pattern` in the table of
// interned regexps
bool regexp_is_interned(const char *pattern)
{
    if (hashmap_find(&interned_regexps, pattern)) {
        return true;
    }
    return false;
}

// Release the compiled regex of `ir` and then the InternedRegexp itself.
// InternedRegexp::str is intentionally NOT freed here: it is the same
// string as HashMapEntry::key, which hashmap_free() already frees.
static void free_interned_regexp(InternedRegexp *ir)
{
    regex_t *compiled = &ir->re;
    regfree(compiled);
    free(ir);
}

void free_interned_regexps(void)
{
    hashmap_free(&interned_regexps, (FreeFunction)free_interned_regexp);
}