Added partial matching and introduced threads

author: Mitja Felicijan <mitja.felicijan@gmail.com> 2023-11-09 23:19:53 +0100
committer: Mitja Felicijan <mitja.felicijan@gmail.com> 2023-11-09 23:19:53 +0100
commit: 1566b6faa8534118c3566188181367cd0868468f (patch)
tree: 1de8d4b369efb5e592685a31088f798a6b63ffa1 /examples/dte/convert.c
parent: 349991bf6efe473ab9a5cbdae0a8114d72b997e3 (diff)
download: crep-1566b6faa8534118c3566188181367cd0868468f.tar.gz
1 files changed, 581 insertions, 0 deletions
diff --git a/examples/dte/convert.c b/examples/dte/convert.c
new file mode 100644
index 0000000..2020ee9
--- /dev/null
+++ b/examples/dte/convert.c
@@ -0,0 +1,581 @@
+#include <errno.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+#include "convert.h"
+#include "util/debug.h"
+#include "util/intern.h"
+#include "util/log.h"
+#include "util/str-util.h"
+#include "util/utf8.h"
+#include "util/xmalloc.h"
+#include "util/xreadwrite.h"
+
+struct FileEncoder {
+    struct cconv *cconv;
+    unsigned char *nbuf;
+    size_t nsize;
+    bool crlf;
+    int fd;
+};
+
+struct FileDecoder {
+    const char *encoding;
+    const unsigned char *ibuf;
+    ssize_t ipos, isize;
+    struct cconv *cconv;
+    bool (*read_line)(struct FileDecoder *dec, const char **linep, size_t *lenp);
+};
+
+const char *file_decoder_get_encoding(const FileDecoder *dec)
+{
+    return dec->encoding;
+}
+
+static bool read_utf8_line(FileDecoder *dec, const char **linep, size_t *lenp)
+{
+    const char *line = dec->ibuf + dec->ipos;
+    const char *nl = memchr(line, '\n', dec->isize - dec->ipos);
+    size_t len;
+
+    if (nl) {
+        len = nl - line;
+        dec->ipos += len + 1;
+    } else {
+        len = dec->isize - dec->ipos;
+        if (len == 0) {
+            return false;
+        }
+        dec->ipos += len;
+    }
+
+    *linep = line;
+    *lenp = len;
+    return true;
+}
+
+static size_t unix_to_dos (
+    FileEncoder *enc,
+    const unsigned char *buf,
+    size_t size
+) {
+    if (enc->nsize < size * 2) {
+        enc->nsize = size * 2;
+        xrenew(enc->nbuf, enc->nsize);
+    }
+    size_t d = 0;
+    for (size_t s = 0; s < size; s++) {
+        unsigned char ch = buf[s];
+        if (ch == '\n') {
+            enc->nbuf[d++] = '\r';
+        }
+        enc->nbuf[d++] = ch;
+    }
+    return d;
+}
+
+#ifdef ICONV_DISABLE // iconv not available; use basic, UTF-8 implementation:
+
+bool conversion_supported_by_iconv (
+    const char* UNUSED_ARG(from),
+    const char* UNUSED_ARG(to)
+) {
+    errno = EINVAL;
+    return false;
+}
+
+FileEncoder *new_file_encoder(const Encoding *encoding, bool crlf, int fd)
+{
+    if (unlikely(encoding->type != UTF8)) {
+        errno = EINVAL;
+        return NULL;
+    }
+    FileEncoder *enc = xnew0(FileEncoder, 1);
+    enc->crlf = crlf;
+    enc->fd = fd;
+    return enc;
+}
+
+void free_file_encoder(FileEncoder *enc)
+{
+    free(enc->nbuf);
+    free(enc);
+}
+
+ssize_t file_encoder_write(FileEncoder *enc, const unsigned char *buf, size_t n)
+{
+    if (enc->crlf) {
+        n = unix_to_dos(enc, buf, n);
+        buf = enc->nbuf;
+    }
+    return xwrite_all(enc->fd, buf, n);
+}
+
+size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc))
+{
+    return 0;
+}
+
+FileDecoder *new_file_decoder(const char *encoding, const unsigned char *buf, size_t n)
+{
+    if (unlikely(encoding && !streq(encoding, "UTF-8"))) {
+        errno = EINVAL;
+        return NULL;
+    }
+    FileDecoder *dec = xnew0(FileDecoder, 1);
+    dec->ibuf = buf;
+    dec->isize = n;
+    return dec;
+}
+
+void free_file_decoder(FileDecoder *dec)
+{
+    free(dec);
+}
+
+bool file_decoder_read_line(FileDecoder *dec, const char **linep, size_t *lenp)
+{
+    return read_utf8_line(dec, linep, lenp);
+}
+
+#else // ICONV_DISABLE is undefined; use full iconv implementation:
+
+#include <iconv.h>
+
+static const unsigned char replacement[2] = "\xc2\xbf"; // U+00BF
+
+struct cconv {
+    iconv_t cd;
+    char *obuf;
+    size_t osize;
+    size_t opos;
+    size_t consumed;
+    size_t errors;
+
+    // Temporary input buffer
+    char tbuf[16];
+    size_t tcount;
+
+    // Replacement character 0xBF (inverted question mark)
+    char rbuf[4];
+    size_t rcount;
+
+    // Input character size in bytes, or zero for UTF-8
+    size_t char_size;
+};
+
+static struct cconv *create(iconv_t cd)
+{
+    struct cconv *c = xnew0(struct cconv, 1);
+    c->cd = cd;
+    c->osize = 8192;
+    c->obuf = xmalloc(c->osize);
+    return c;
+}
+
+static size_t encoding_char_size(const char *encoding)
+{
+    if (str_has_prefix(encoding, "UTF-16")) {
+        return 2;
+    }
+    if (str_has_prefix(encoding, "UTF-32")) {
+        return 4;
+    }
+    return 1;
+}
+
+static size_t iconv_wrapper (
+    iconv_t cd,
+    const char **restrict inbuf,
+    size_t *restrict inbytesleft,
+    char **restrict outbuf,
+    size_t *restrict outbytesleft
+) {
+    // POSIX defines the second parameter of iconv(3) as "char **restrict"
+    // but NetBSD declares it as "const char **restrict"
+#ifdef __NetBSD__
+    const char **restrict in = inbuf;
+#else
+    char **restrict in = (char **restrict)inbuf;
+#endif
+
+    return iconv(cd, in, inbytesleft, outbuf, outbytesleft);
+}
+
+static void encode_replacement(struct cconv *c)
+{
+    const char *ib = replacement;
+    char *ob = c->rbuf;
+    size_t ic = sizeof(replacement);
+    size_t oc = sizeof(c->rbuf);
+    size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc);
+
+    if (rc == (size_t)-1) {
+        c->rbuf[0] = '\xbf';
+        c->rcount = 1;
+    } else {
+        c->rcount = ob - c->rbuf;
+    }
+}
+
+static void resize_obuf(struct cconv *c)
+{
+    c->osize *= 2;
+    xrenew(c->obuf, c->osize);
+}
+
+static void add_replacement(struct cconv *c)
+{
+    if (c->osize - c->opos < 4) {
+        resize_obuf(c);
+    }
+
+    memcpy(c->obuf + c->opos, c->rbuf, c->rcount);
+    c->opos += c->rcount;
+}
+
+static size_t handle_invalid(struct cconv *c, const char *buf, size_t count)
+{
+    LOG_DEBUG("%zu %zu", c->char_size, count);
+    add_replacement(c);
+    if (c->char_size == 0) {
+        // Converting from UTF-8
+        size_t idx = 0;
+        CodePoint u = u_get_char(buf, count, &idx);
+        LOG_DEBUG("U+%04" PRIX32, u);
+        return idx;
+    }
+    if (c->char_size > count) {
+        // wtf
+        return 1;
+    }
+    return c->char_size;
+}
+
+static int xiconv(struct cconv *c, const char **ib, size_t *ic)
+{
+    while (1) {
+        char *ob = c->obuf + c->opos;
+        size_t oc = c->osize - c->opos;
+        size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc);
+        c->opos = ob - c->obuf;
+        if (rc == (size_t)-1) {
+            switch (errno) {
+            case EILSEQ:
+                c->errors++;
+                // Reset
+                iconv(c->cd, NULL, NULL, NULL, NULL);
+                return errno;
+            case EINVAL:
+                return errno;
+            case E2BIG:
+                resize_obuf(c);
+                continue;
+            default:
+                BUG("iconv: %s", strerror(errno));
+            }
+        } else {
+            c->errors += rc;
+        }
+        return 0;
+    }
+}
+
+static size_t convert_incomplete(struct cconv *c, const char *input, size_t len)
+{
+    size_t ipos = 0;
+    while (c->tcount < sizeof(c->tbuf) && ipos < len) {
+        c->tbuf[c->tcount++] = input[ipos++];
+        const char *ib = c->tbuf;
+        size_t ic = c->tcount;
+        int rc = xiconv(c, &ib, &ic);
+        if (ic > 0) {
+            memmove(c->tbuf, ib, ic);
+        }
+        c->tcount = ic;
+        if (rc == EINVAL) {
+            // Incomplete character at end of input buffer; try again
+            // with more input data
+            continue;
+        }
+        if (rc == EILSEQ) {
+            // Invalid multibyte sequence
+            size_t skip = handle_invalid(c, c->tbuf, c->tcount);
+            c->tcount -= skip;
+            if (c->tcount > 0) {
+                LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip);
+                memmove(c->tbuf, c->tbuf + skip, c->tcount);
+                continue;
+            }
+            return ipos;
+        }
+        break;
+    }
+
+    LOG_DEBUG("%zu %zu", ipos, c->tcount);
+    return ipos;
+}
+
+static void cconv_process(struct cconv *c, const char *input, size_t len)
+{
+    if (c->consumed > 0) {
+        size_t fill = c->opos - c->consumed;
+        memmove(c->obuf, c->obuf + c->consumed, fill);
+        c->opos = fill;
+        c->consumed = 0;
+    }
+
+    if (c->tcount > 0) {
+        size_t ipos = convert_incomplete(c, input, len);
+        input += ipos;
+        len -= ipos;
+    }
+
+    const char *ib = input;
+    for (size_t ic = len; ic > 0; ) {
+        int r = xiconv(c, &ib, &ic);
+        if (r == EINVAL) {
+            // Incomplete character at end of input buffer
+            if (ic < sizeof(c->tbuf)) {
+                memcpy(c->tbuf, ib, ic);
+                c->tcount = ic;
+            } else {
+                // FIXME
+            }
+            ic = 0;
+            continue;
+        }
+        if (r == EILSEQ) {
+            // Invalid multibyte sequence
+            size_t skip = handle_invalid(c, ib, ic);
+            ic -= skip;
+            ib += skip;
+            continue;
+        }
+    }
+}
+
+static struct cconv *cconv_to_utf8(const char *encoding)
+{
+    iconv_t cd = iconv_open("UTF-8", encoding);
+    if (cd == (iconv_t)-1) {
+        return NULL;
+    }
+    struct cconv *c = create(cd);
+    memcpy(c->rbuf, replacement, sizeof(replacement));
+    c->rcount = sizeof(replacement);
+    c->char_size = encoding_char_size(encoding);
+    return c;
+}
+
+static struct cconv *cconv_from_utf8(const char *encoding)
+{
+    iconv_t cd = iconv_open(encoding, "UTF-8");
+    if (cd == (iconv_t)-1) {
+        return NULL;
+    }
+    struct cconv *c = create(cd);
+    encode_replacement(c);
+    return c;
+}
+
+static void cconv_flush(struct cconv *c)
+{
+    if (c->tcount > 0) {
+        // Replace incomplete character at end of input buffer
+        LOG_DEBUG("incomplete character at EOF");
+        add_replacement(c);
+        c->tcount = 0;
+    }
+}
+
+static char *cconv_consume_line(struct cconv *c, size_t *len)
+{
+    char *line = c->obuf + c->consumed;
+    char *nl = memchr(line, '\n', c->opos - c->consumed);
+    if (!nl) {
+        *len = 0;
+        return NULL;
+    }
+
+    size_t n = nl - line + 1;
+    c->consumed += n;
+    *len = n;
+    return line;
+}
+
+static char *cconv_consume_all(struct cconv *c, size_t *len)
+{
+    char *buf = c->obuf + c->consumed;
+    *len = c->opos - c->consumed;
+    c->consumed = c->opos;
+    return buf;
+}
+
+static void cconv_free(struct cconv *c)
+{
+    iconv_close(c->cd);
+    free(c->obuf);
+    free(c);
+}
+
+bool conversion_supported_by_iconv(const char *from, const char *to)
+{
+    if (unlikely(from[0] == '\0' || to[0] == '\0')) {
+        errno = EINVAL;
+        return false;
+    }
+
+    iconv_t cd = iconv_open(to, from);
+    if (cd == (iconv_t)-1) {
+        return false;
+    }
+
+    iconv_close(cd);
+    return true;
+}
+
+FileEncoder *new_file_encoder(const Encoding *encoding, bool crlf, int fd)
+{
+    FileEncoder *enc = xnew0(FileEncoder, 1);
+    enc->crlf = crlf;
+    enc->fd = fd;
+
+    if (encoding->type != UTF8) {
+        enc->cconv = cconv_from_utf8(encoding->name);
+        if (!enc->cconv) {
+            free(enc);
+            return NULL;
+        }
+    }
+
+    return enc;
+}
+
+void free_file_encoder(FileEncoder *enc)
+{
+    if (enc->cconv) {
+        cconv_free(enc->cconv);
+    }
+    free(enc->nbuf);
+    free(enc);
+}
+
+// NOTE: buf must contain whole characters!
+ssize_t file_encoder_write (
+    FileEncoder *enc,
+    const unsigned char *buf,
+    size_t size
+) {
+    if (enc->crlf) {
+        size = unix_to_dos(enc, buf, size);
+        buf = enc->nbuf;
+    }
+    if (enc->cconv) {
+        cconv_process(enc->cconv, buf, size);
+        cconv_flush(enc->cconv);
+        buf = cconv_consume_all(enc->cconv, &size);
+    }
+    return xwrite_all(enc->fd, buf, size);
+}
+
+size_t file_encoder_get_nr_errors(const FileEncoder *enc)
+{
+    return enc->cconv ? enc->cconv->errors : 0;
+}
+
+static bool fill(FileDecoder *dec)
+{
+    if (dec->ipos == dec->isize) {
+        return false;
+    }
+
+    // Smaller than cconv.obuf to make realloc less likely
+    size_t max = 7 * 1024;
+
+    size_t icount = MIN(dec->isize - dec->ipos, max);
+    cconv_process(dec->cconv, dec->ibuf + dec->ipos, icount);
+    dec->ipos += icount;
+    if (dec->ipos == dec->isize) {
+        // Must be flushed after all input has been fed
+        cconv_flush(dec->cconv);
+    }
+    return true;
+}
+
+static bool decode_and_read_line(FileDecoder *dec, const char **linep, size_t *lenp)
+{
+    char *line;
+    size_t len;
+    while (1) {
+        line = cconv_consume_line(dec->cconv, &len);
+        if (line || !fill(dec)) {
+            break;
+        }
+    }
+
+    if (line) {
+        // Newline not wanted
+        len--;
+    } else {
+        line = cconv_consume_all(dec->cconv, &len);
+        if (len == 0) {
+            return false;
+        }
+    }
+
+    *linep = line;
+    *lenp = len;
+    return true;
+}
+
+static bool set_encoding(FileDecoder *dec, const char *encoding)
+{
+    if (strcmp(encoding, "UTF-8") == 0) {
+        dec->read_line = read_utf8_line;
+    } else {
+        dec->cconv = cconv_to_utf8(encoding);
+        if (!dec->cconv) {
+            return false;
+        }
+        dec->read_line = decode_and_read_line;
+    }
+    dec->encoding = str_intern(encoding);
+    return true;
+}
+
+FileDecoder *new_file_decoder (
+    const char *encoding,
+    const unsigned char *buf,
+    size_t size
+) {
+    FileDecoder *dec = xnew0(FileDecoder, 1);
+    dec->ibuf = buf;
+    dec->isize = size;
+
+    if (!encoding) {
+        encoding = "UTF-8";
+    }
+
+    if (!set_encoding(dec, encoding)) {
+        free_file_decoder(dec);
+        return NULL;
+    }
+
+    return dec;
+}
+
+void free_file_decoder(FileDecoder *dec)
+{
+    if (dec->cconv) {
+        cconv_free(dec->cconv);
+    }
+    free(dec);
+}
+
+bool file_decoder_read_line(FileDecoder *dec, const char **linep, size_t *lenp)
+{
+    return dec->read_line(dec, linep, lenp);
+}
+
+#endif
author	Mitja Felicijan <mitja.felicijan@gmail.com>	2023-11-09 23:19:53 +0100
committer	Mitja Felicijan <mitja.felicijan@gmail.com>	2023-11-09 23:19:53 +0100
commit	1566b6faa8534118c3566188181367cd0868468f (patch)
tree	1de8d4b369efb5e592685a31088f798a6b63ffa1 /examples/dte/convert.c
parent	349991bf6efe473ab9a5cbdae0a8114d72b997e3 (diff)
download	crep-1566b6faa8534118c3566188181367cd0868468f.tar.gz