// path: root/examples/dte/encoding.c
#include "encoding.h"
#include "util/ascii.h"
#include "util/bsearch.h"
#include "util/debug.h"
#include "util/intern.h"
#include "util/str-util.h"

// Maps an alternative spelling of an encoding name to its EncodingType.
typedef struct {
    // Alias string; every entry in encoding_aliases[] fits in 8 bytes
    // including the terminating NUL.
    // NOTE(review): enc_alias_cmp() appears to depend on this being the
    // first member -- confirm before reordering fields.
    const char alias[8];
    // The encoding type that the alias resolves to
    EncodingType encoding;
} EncodingAlias;

// Canonical encoding names, indexed by EncodingType. Used both for
// name -> type lookups (lookup_encoding()) and for type -> name
// conversion (encoding_type_to_string()).
static const char encoding_names[][16] = {
    [UTF8] = "UTF-8",
    [UTF16BE] = "UTF-16BE",
    [UTF16LE] = "UTF-16LE",
    [UTF32BE] = "UTF-32BE",
    [UTF32LE] = "UTF-32LE",
};

// Alternative names accepted by lookup_encoding(), in addition to the
// canonical names in encoding_names[]. Aliases without an explicit byte
// order (e.g. "UTF-16", "UCS-4") are mapped to the big-endian type.
// This table is searched with BSEARCH(), so entries must be kept in the
// order expected by the comparison function (see the UNITTEST that
// checks this array with ascii_strcmp_icase).
static const EncodingAlias encoding_aliases[] = {
    {"UCS-2", UTF16BE},
    {"UCS-2BE", UTF16BE},
    {"UCS-2LE", UTF16LE},
    {"UCS-4", UTF32BE},
    {"UCS-4BE", UTF32BE},
    {"UCS-4LE", UTF32LE},
    {"UCS2", UTF16BE},
    {"UCS4", UTF32BE},
    {"UTF-16", UTF16BE},
    {"UTF-32", UTF32BE},
    {"UTF16", UTF16BE},
    {"UTF16BE", UTF16BE},
    {"UTF16LE", UTF16LE},
    {"UTF32", UTF32BE},
    {"UTF32BE", UTF32BE},
    {"UTF32LE", UTF32LE},
    {"UTF8", UTF8},
};

// Byte order mark for each encoding type, indexed by EncodingType.
// Each entry holds the BOM byte sequence followed by its length in bytes.
// Any index without an initializer here is zero-initialized and thus has
// a length of 0, which the functions below treat as "no BOM".
static const ByteOrderMark boms[NR_ENCODING_TYPES] = {
    [UTF8] = {{0xef, 0xbb, 0xbf}, 3},
    [UTF16BE] = {{0xfe, 0xff}, 2},
    [UTF16LE] = {{0xff, 0xfe}, 2},
    [UTF32BE] = {{0x00, 0x00, 0xfe, 0xff}, 4},
    [UTF32LE] = {{0xff, 0xfe, 0x00, 0x00}, 4},
};

// Self-test for the encoding_aliases[] table, which is searched with
// BSEARCH() in lookup_encoding().
// NOTE(review): CHECK_BSEARCH_ARRAY presumably verifies that the `alias`
// fields are sorted according to ascii_strcmp_icase -- confirm against
// util/bsearch.h.
UNITTEST {
    CHECK_BSEARCH_ARRAY(encoding_aliases, alias, ascii_strcmp_icase);
}

// Comparison callback used when searching encoding_aliases[] with
// BSEARCH() (see lookup_encoding()).
//
// key:  the NUL-terminated encoding name being looked up
// elem: pointer to the EncodingAlias array element being examined
//
// Returns the result of ascii_strcmp_icase(name, alias), i.e. the
// ordering of the searched-for name relative to the element's alias.
//
// The arguments follow the bsearch(3) convention (key first, array
// element second). Each pointer is converted to its real type, so the
// comparison does not depend on `alias` being the first struct member.
// NOTE(review): assumes BSEARCH() forwards to bsearch(3) semantics --
// confirm against util/bsearch.h.
static int enc_alias_cmp(const void *key, const void *elem)
{
    const char *name = key;
    const EncodingAlias *a = elem;
    return ascii_strcmp_icase(name, a->alias);
}

EncodingType lookup_encoding(const char *name)
{
    static_assert(ARRAYLEN(encoding_names) == NR_ENCODING_TYPES - 1);
    for (size_t i = 0; i < ARRAYLEN(encoding_names); i++) {
        if (ascii_streq_icase(name, encoding_names[i])) {
            return (EncodingType) i;
        }
    }

    const EncodingAlias *a = BSEARCH(name, encoding_aliases, enc_alias_cmp);
    return a ? a->encoding : UNKNOWN_ENCODING;
}

// Convert an EncodingType to its canonical name, passed through
// str_intern(). Returns NULL for UNKNOWN_ENCODING and for any value
// that is not below NR_ENCODING_TYPES.
static const char *encoding_type_to_string(EncodingType type)
{
    if (type >= NR_ENCODING_TYPES || type == UNKNOWN_ENCODING) {
        // No entry in encoding_names[] for this value
        return NULL;
    }
    return str_intern(encoding_names[type]);
}

// Build an Encoding from a name string.
//
// If lookup_encoding() recognizes the name, the result carries that
// type and its canonical name. Otherwise the type is UNKNOWN_ENCODING
// and the name is the input converted with ascii_toupper() and passed
// through mem_intern(). At most 256 bytes of an unrecognized name are
// used; anything beyond that is dropped.
Encoding encoding_from_name(const char *name)
{
    const EncodingType type = lookup_encoding(name);
    if (type != UNKNOWN_ENCODING) {
        return (Encoding) {
            .type = type,
            .name = encoding_type_to_string(type)
        };
    }

    // Unrecognized name: normalize to upper case in a bounded buffer.
    // The buffer isn't NUL-terminated; its length is passed explicitly.
    char buf[256];
    size_t len = 0;
    while (len < sizeof(buf) && name[len]) {
        buf[len] = ascii_toupper(name[len]);
        len++;
    }

    return (Encoding) {
        .type = type,
        .name = mem_intern(buf, len)
    };
}

// Build an Encoding from an EncodingType. The name is whatever
// encoding_type_to_string() returns for that type, which is NULL for
// types that have no canonical name.
Encoding encoding_from_type(EncodingType type)
{
    Encoding enc = {
        .type = type,
        .name = encoding_type_to_string(type)
    };
    return enc;
}

// Detect an encoding from a byte order mark at the start of a buffer.
//
// buf:  start of the data to examine
// size: number of readable bytes in buf
//
// Returns the EncodingType whose BOM (see boms[]) prefixes the buffer,
// or UNKNOWN_ENCODING if there is none.
EncodingType detect_encoding_from_bom(const unsigned char *buf, size_t size)
{
    // The shortest BOM is 2 bytes long
    if (size < 2) {
        return UNKNOWN_ENCODING;
    }

    // Cheap rejection: every BOM in boms[] starts with either 0x00 or a
    // byte >= 0xEF, so any other first byte can't begin a BOM
    const unsigned int first = buf[0];
    if (first != 0x00 && first < 0xEF) {
        return UNKNOWN_ENCODING;
    }

    // Walk the table from the end, so that UTF32LE is tested before
    // UTF16LE (the UTF16LE BOM is a prefix of the UTF32LE one)
    for (size_t i = NR_ENCODING_TYPES; i-- > 0; ) {
        const ByteOrderMark *bom = &boms[i];
        const unsigned int bom_len = bom->len;
        if (bom_len == 0 || size < bom_len) {
            // No BOM for this type, or buffer too short to contain it
            continue;
        }
        if (mem_equal(buf, bom->bytes, bom_len)) {
            return (EncodingType) i;
        }
    }

    return UNKNOWN_ENCODING;
}

// Get the byte order mark associated with an encoding type.
//
// Returns a pointer to the matching entry in boms[], or NULL if the
// entry has a length of 0 (i.e. the type has no BOM). Passing a value
// outside the bounds of boms[] trips BUG_ON().
const ByteOrderMark *get_bom_for_encoding(EncodingType encoding)
{
    static_assert(ARRAYLEN(boms) == NR_ENCODING_TYPES);
    BUG_ON(encoding >= ARRAYLEN(boms));

    const ByteOrderMark *entry = boms + encoding;
    if (entry->len == 0) {
        return NULL;
    }
    return entry;
}