summary | refs | log | tree | commit | diff
path: root/examples/dte/convert.c
diff options
context:
space:
mode:
author: Mitja Felicijan <mitja.felicijan@gmail.com> 2023-11-09 23:19:53 +0100
committer: Mitja Felicijan <mitja.felicijan@gmail.com> 2023-11-09 23:19:53 +0100
commit: 1566b6faa8534118c3566188181367cd0868468f (patch)
tree: 1de8d4b369efb5e592685a31088f798a6b63ffa1 /examples/dte/convert.c
parent: 349991bf6efe473ab9a5cbdae0a8114d72b997e3 (diff)
download: crep-1566b6faa8534118c3566188181367cd0868468f.tar.gz
Added partial matching and introduced threads
Diffstat (limited to 'examples/dte/convert.c')
-rw-r--r-- examples/dte/convert.c | 581
1 files changed, 581 insertions, 0 deletions
diff --git a/examples/dte/convert.c b/examples/dte/convert.c
new file mode 100644
index 0000000..2020ee9
--- /dev/null
+++ b/examples/dte/convert.c
@@ -0,0 +1,581 @@
+#include <errno.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+#include "convert.h"
+#include "util/debug.h"
+#include "util/intern.h"
+#include "util/log.h"
+#include "util/str-util.h"
+#include "util/utf8.h"
+#include "util/xmalloc.h"
+#include "util/xreadwrite.h"
+
+struct FileEncoder { // Output-side state used by file_encoder_write()
+ struct cconv *cconv; // Converter from UTF-8; NULL when no conversion was set up
+ unsigned char *nbuf; // Scratch buffer filled by unix_to_dos(); grown on demand
+ size_t nsize; // Allocated size of nbuf, in bytes
+ bool crlf; // If true, each '\n' is written as "\r\n"
+ int fd; // File descriptor handed to xwrite_all()
+};
+
+struct FileDecoder { // Input-side state for reading lines out of an in-memory buffer
+ const char *encoding; // Encoding name; set by set_encoding() in the iconv build
+ const unsigned char *ibuf; // Caller-supplied input bytes; not freed by free_file_decoder()
+ ssize_t ipos, isize; // Current read offset into ibuf and total size of ibuf
+ struct cconv *cconv; // Converter to UTF-8; stays NULL when lines are read directly as UTF-8
+ bool (*read_line)(struct FileDecoder *dec, const char **linep, size_t *lenp); // Line reader selected by set_encoding()
+};
+
+// Return the encoding name recorded in the decoder. The pointer is owned
+// by the decoder and is whatever was stored in dec->encoding.
+const char *file_decoder_get_encoding(const FileDecoder *dec)
+{
+    const char *name = dec->encoding;
+    return name;
+}
+
+// Hand out the next line of the input buffer without copying or converting.
+//
+// On success, *linep points into dec->ibuf and *lenp is the line length
+// excluding the terminating '\n' (which is skipped if present). The final
+// line may lack a newline. Returns false once no bytes remain.
+static bool read_utf8_line(FileDecoder *dec, const char **linep, size_t *lenp)
+{
+    // ibuf holds raw bytes as unsigned char, while the line API hands out
+    // char pointers. Convert explicitly; char and unsigned char pointers
+    // are not implicitly convertible in ISO C.
+    const char *line = (const char*)dec->ibuf + dec->ipos;
+    const size_t avail = dec->isize - dec->ipos;
+    const char *nl = memchr(line, '\n', avail);
+
+    size_t len;
+    if (nl) {
+        len = (size_t)(nl - line);
+        // Step over the newline as well
+        dec->ipos += len + 1;
+    } else {
+        len = avail;
+        if (len == 0) {
+            // End of input
+            return false;
+        }
+        dec->ipos += len;
+    }
+
+    *linep = line;
+    *lenp = len;
+    return true;
+}
+
+// Copy `size` bytes from `buf` into enc->nbuf, inserting '\r' before every
+// '\n'. The scratch buffer is grown to twice the input size when needed
+// (the worst case, where every byte is a newline). Returns the number of
+// bytes stored in enc->nbuf.
+static size_t unix_to_dos (
+    FileEncoder *enc,
+    const unsigned char *buf,
+    size_t size
+) {
+    const size_t worst_case = size * 2;
+    if (worst_case > enc->nsize) {
+        enc->nsize = worst_case;
+        xrenew(enc->nbuf, enc->nsize);
+    }
+
+    size_t out = 0;
+    size_t in = 0;
+    while (in < size) {
+        const unsigned char byte = buf[in++];
+        if (byte == '\n') {
+            enc->nbuf[out++] = '\r';
+        }
+        enc->nbuf[out++] = byte;
+    }
+    return out;
+}
+
+#ifdef ICONV_DISABLE // iconv not available; use basic, UTF-8 implementation:
+
+// Stub used when iconv is disabled: no conversion is supported, so this
+// always sets errno to EINVAL and returns false. Both arguments are unused.
+bool conversion_supported_by_iconv (
+    const char* UNUSED_ARG(from),
+    const char* UNUSED_ARG(to)
+) {
+    const bool supported = false;
+    errno = EINVAL;
+    return supported;
+}
+
+// Create an encoder that writes to `fd`, optionally translating '\n' to
+// "\r\n". Without iconv only UTF-8 is accepted; any other encoding type
+// yields NULL with errno set to EINVAL. Release with free_file_encoder().
+FileEncoder *new_file_encoder(const Encoding *encoding, bool crlf, int fd)
+{
+    const bool is_utf8 = (encoding->type == UTF8);
+    if (unlikely(!is_utf8)) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    FileEncoder *encoder = xnew0(FileEncoder, 1);
+    encoder->fd = fd;
+    encoder->crlf = crlf;
+    return encoder;
+}
+
+// Release the encoder and its CRLF scratch buffer. The file descriptor is
+// left open.
+void free_file_encoder(FileEncoder *enc)
+{
+    unsigned char *scratch = enc->nbuf;
+    free(scratch);
+    free(enc);
+}
+
+// Write `n` bytes from `buf` to the encoder's file descriptor, expanding
+// '\n' to "\r\n" first when CRLF mode is enabled. Returns the result of
+// xwrite_all().
+ssize_t file_encoder_write(FileEncoder *enc, const unsigned char *buf, size_t n)
+{
+    if (!enc->crlf) {
+        return xwrite_all(enc->fd, buf, n);
+    }
+
+    const size_t dos_len = unix_to_dos(enc, buf, n);
+    return xwrite_all(enc->fd, enc->nbuf, dos_len);
+}
+
+// No conversion is performed without iconv, so no conversion errors can
+// occur; always returns 0.
+size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc))
+{
+    const size_t no_errors = 0;
+    return no_errors;
+}
+
+// Create a decoder over the `n` bytes at `buf` (the buffer is borrowed,
+// not copied). Without iconv only UTF-8 input is supported: `encoding`
+// must be NULL or "UTF-8", otherwise NULL is returned with errno set to
+// EINVAL. Release with free_file_decoder().
+FileDecoder *new_file_decoder(const char *encoding, const unsigned char *buf, size_t n)
+{
+    if (unlikely(encoding && !streq(encoding, "UTF-8"))) {
+        errno = EINVAL;
+        return NULL;
+    }
+    FileDecoder *dec = xnew0(FileDecoder, 1);
+    dec->ibuf = buf;
+    dec->isize = n;
+    // Record the encoding name, as the iconv build does in set_encoding(),
+    // so that file_decoder_get_encoding() never returns NULL for a
+    // successfully created decoder
+    dec->encoding = str_intern("UTF-8");
+    return dec;
+}
+
+// Release the decoder object itself. The input buffer it was created
+// over is not touched.
+void free_file_decoder(FileDecoder *dec)
+{
+    free(dec);
+}
+
+// Fetch the next line from the decoder. Without iconv the input is always
+// treated as UTF-8, so this goes straight to read_utf8_line().
+bool file_decoder_read_line(FileDecoder *dec, const char **linep, size_t *lenp)
+{
+    const bool got_line = read_utf8_line(dec, linep, lenp);
+    return got_line;
+}
+
+#else // ICONV_DISABLE is undefined; use full iconv implementation:
+
+#include <iconv.h>
+
+static const unsigned char replacement[2] = "\xc2\xbf"; // U+00BF (inverted question mark) as UTF-8; substituted for unconvertible input
+
+struct cconv { // Conversion state wrapped around a single iconv descriptor
+ iconv_t cd; // Descriptor from iconv_open(); closed by cconv_free()
+ char *obuf; // Output buffer; doubled in size by resize_obuf()
+ size_t osize; // Allocated size of obuf
+ size_t opos; // Number of converted bytes currently stored in obuf
+ size_t consumed; // Leading bytes of obuf already handed out by cconv_consume_*()
+ size_t errors; // Invalid sequences seen plus the counts returned by successful iconv() calls
+
+ // Temporary input buffer for an incomplete character left at the end of an input chunk
+ char tbuf[16];
+ size_t tcount; // Number of bytes currently held in tbuf
+
+ // Replacement character 0xBF (inverted question mark), as written to the output
+ char rbuf[4];
+ size_t rcount; // Number of valid bytes in rbuf
+
+ // Input character size in bytes, or zero for UTF-8
+ size_t char_size;
+};
+
+// Allocate a zero-initialized converter around `cd`, with an initial
+// 8192-byte output buffer. The converter takes over the descriptor.
+static struct cconv *create(iconv_t cd)
+{
+    enum { INITIAL_OBUF_SIZE = 8192 };
+
+    struct cconv *conv = xnew0(struct cconv, 1);
+    conv->obuf = xmalloc(INITIAL_OBUF_SIZE);
+    conv->osize = INITIAL_OBUF_SIZE;
+    conv->cd = cd;
+    return conv;
+}
+
+// Return the code unit width, in bytes, implied by an encoding name:
+// 2 for names starting with "UTF-16", 4 for "UTF-32", otherwise 1.
+static size_t encoding_char_size(const char *encoding)
+{
+    size_t width = 1;
+    if (str_has_prefix(encoding, "UTF-16")) {
+        width = 2;
+    } else if (str_has_prefix(encoding, "UTF-32")) {
+        width = 4;
+    }
+    return width;
+}
+
+// Call iconv(3) with a const-correct input pointer.
+//
+// POSIX declares the second parameter of iconv(3) as "char **restrict",
+// whereas NetBSD declares it as "const char **restrict"; the cast needed
+// on other platforms is confined to this one place.
+static size_t iconv_wrapper (
+    iconv_t cd,
+    const char **restrict inbuf,
+    size_t *restrict inbytesleft,
+    char **restrict outbuf,
+    size_t *restrict outbytesleft
+) {
+#ifdef __NetBSD__
+    return iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft);
+#else
+    return iconv(cd, (char **restrict)inbuf, inbytesleft, outbuf, outbytesleft);
+#endif
+}
+
+// Pre-compute the replacement character in the converter's output
+// encoding by converting U+00BF through c->cd into c->rbuf. If the target
+// encoding cannot represent it (iconv fails), fall back to the single
+// byte 0xBF.
+static void encode_replacement(struct cconv *c)
+{
+    // `replacement` is an unsigned char array but iconv works on char
+    // pointers. Convert explicitly; the two pointer types are not
+    // implicitly convertible in ISO C.
+    const char *ib = (const char*)replacement;
+    char *ob = c->rbuf;
+    size_t ic = sizeof(replacement);
+    size_t oc = sizeof(c->rbuf);
+    size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc);
+
+    if (rc == (size_t)-1) {
+        c->rbuf[0] = '\xbf';
+        c->rcount = 1;
+    } else {
+        // `ob` was advanced past the bytes iconv produced
+        c->rcount = ob - c->rbuf;
+    }
+}
+
+// Double the capacity of the converter's output buffer; the bytes already
+// stored are kept by the reallocation.
+static void resize_obuf(struct cconv *c)
+{
+    const size_t doubled = c->osize * 2;
+    c->osize = doubled;
+    xrenew(c->obuf, c->osize);
+}
+
+// Append the pre-encoded replacement character (c->rbuf) to the output
+// buffer, growing the buffer first if fewer than sizeof(c->rbuf) bytes
+// are free.
+static void add_replacement(struct cconv *c)
+{
+    const size_t space_left = c->osize - c->opos;
+    if (space_left < sizeof(c->rbuf)) {
+        resize_obuf(c);
+    }
+
+    // Compute the destination only after a possible reallocation
+    char *dest = c->obuf + c->opos;
+    memcpy(dest, c->rbuf, c->rcount);
+    c->opos += c->rcount;
+}
+
+// Emit a replacement character for the invalid sequence at `buf` and
+// return how many input bytes the caller should skip.
+//
+// For UTF-8 input (char_size == 0) the skip length is whatever
+// u_get_char() advances over. For fixed-width input it is one code unit,
+// or a single byte if fewer than char_size bytes remain.
+static size_t handle_invalid(struct cconv *c, const char *buf, size_t count)
+{
+    LOG_DEBUG("%zu %zu", c->char_size, count);
+    add_replacement(c);
+
+    const size_t width = c->char_size;
+    if (width != 0) {
+        // Truncated code unit at the end: make progress one byte at a time
+        return (width > count) ? 1 : width;
+    }
+
+    // Converting from UTF-8
+    size_t idx = 0;
+    CodePoint u = u_get_char(buf, count, &idx);
+    LOG_DEBUG("U+%04" PRIX32, u);
+    return idx;
+}
+
+// Run iconv on the input described by *ib / *ic, appending the result to
+// c->obuf and growing that buffer as often as needed.
+//
+// Returns 0 when all input was converted, EILSEQ for an invalid input
+// sequence (counted in c->errors, with the descriptor reset), or EINVAL
+// for an incomplete sequence at the end of input. *ib and *ic are
+// advanced past whatever was converted. Any other iconv error is
+// reported via BUG().
+static int xiconv(struct cconv *c, const char **ib, size_t *ic)
+{
+    while (1) {
+        char *ob = c->obuf + c->opos;
+        size_t oc = c->osize - c->opos;
+        size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc);
+        c->opos = ob - c->obuf;
+        if (rc != (size_t)-1) {
+            // Success: rc is the count reported by iconv()
+            c->errors += rc;
+            return 0;
+        }
+
+        // Snapshot errno right away. Later library calls, including the
+        // reset below, are allowed to change it even when they succeed.
+        const int err = errno;
+        switch (err) {
+        case EILSEQ:
+            c->errors++;
+            // Reset
+            iconv(c->cd, NULL, NULL, NULL, NULL);
+            return EILSEQ;
+        case EINVAL:
+            return EINVAL;
+        case E2BIG:
+            // Output buffer full: enlarge it and convert the rest
+            resize_obuf(c);
+            continue;
+        default:
+            BUG("iconv: %s", strerror(err));
+        }
+        return 0;
+    }
+}
+
+static size_t convert_incomplete(struct cconv *c, const char *input, size_t len) // Complete the partial character held in c->tbuf using bytes from input; returns the number of input bytes taken
+{
+ size_t ipos = 0;
+ while (c->tcount < sizeof(c->tbuf) && ipos < len) { // Stop when tbuf is full or the input is exhausted
+ c->tbuf[c->tcount++] = input[ipos++]; // Append one more input byte to the pending sequence
+ const char *ib = c->tbuf;
+ size_t ic = c->tcount;
+ int rc = xiconv(c, &ib, &ic); // Try to convert everything pending; ib/ic are advanced past what was converted
+ if (ic > 0) {
+ memmove(c->tbuf, ib, ic); // Shift the unconverted remainder to the front of tbuf
+ }
+ c->tcount = ic;
+ if (rc == EINVAL) {
+ // Incomplete character at end of input buffer; try again
+ // with more input data
+ continue;
+ }
+ if (rc == EILSEQ) {
+ // Invalid multibyte sequence
+ size_t skip = handle_invalid(c, c->tbuf, c->tcount); // Emits a replacement and reports how many bytes to drop
+ c->tcount -= skip;
+ if (c->tcount > 0) {
+ LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip);
+ memmove(c->tbuf, c->tbuf + skip, c->tcount); // Drop the skipped bytes and keep the rest pending
+ continue;
+ }
+ return ipos; // Nothing left pending after skipping the invalid bytes
+ }
+ break; // xiconv() returned 0: leave the loop
+ }
+
+ LOG_DEBUG("%zu %zu", ipos, c->tcount);
+ return ipos;
+}
+
+// Convert `len` bytes of `input`, appending the result to c->obuf.
+//
+// Output already handed out (c->consumed) is discarded first. If the
+// previous call ended in the middle of a character, that character is
+// completed from the start of `input`. Invalid sequences are replaced
+// via handle_invalid(), and an incomplete character at the end of
+// `input` is saved in c->tbuf for the next call (or for cconv_flush()).
+static void cconv_process(struct cconv *c, const char *input, size_t len)
+{
+    if (c->consumed > 0) {
+        // Move the not-yet-consumed output to the front of the buffer
+        size_t pending = c->opos - c->consumed;
+        memmove(c->obuf, c->obuf + c->consumed, pending);
+        c->opos = pending;
+        c->consumed = 0;
+    }
+
+    if (c->tcount > 0) {
+        size_t ipos = convert_incomplete(c, input, len);
+        input += ipos;
+        len -= ipos;
+    }
+
+    const char *ib = input;
+    size_t ic = len;
+    while (ic > 0) {
+        int r = xiconv(c, &ib, &ic);
+        if (r == EINVAL) {
+            // Incomplete character at end of input buffer
+            if (ic < sizeof(c->tbuf)) {
+                memcpy(c->tbuf, ib, ic);
+                c->tcount = ic;
+            } else {
+                // The tail is too long to save in tbuf. Rather than
+                // dropping it silently, mark the loss in the output and
+                // count it as a conversion error.
+                LOG_DEBUG("incomplete tail too long: %zu", ic);
+                c->errors++;
+                add_replacement(c);
+            }
+            // The remaining input has been dealt with either way
+            return;
+        }
+        if (r == EILSEQ) {
+            // Invalid multibyte sequence
+            size_t skip = handle_invalid(c, ib, ic);
+            ic -= skip;
+            ib += skip;
+        }
+    }
+}
+
+// Create a converter from `encoding` to UTF-8, or return NULL if
+// iconv_open() rejects the pair. The replacement character is stored as
+// UTF-8 and the input code unit size is derived from the encoding name.
+static struct cconv *cconv_to_utf8(const char *encoding)
+{
+    iconv_t cd = iconv_open("UTF-8", encoding);
+    if (cd != (iconv_t)-1) {
+        struct cconv *conv = create(cd);
+        conv->char_size = encoding_char_size(encoding);
+        conv->rcount = sizeof(replacement);
+        memcpy(conv->rbuf, replacement, sizeof(replacement));
+        return conv;
+    }
+    return NULL;
+}
+
+// Create a converter from UTF-8 to `encoding`, or return NULL if
+// iconv_open() rejects the pair. The replacement character is
+// pre-encoded into the target encoding; char_size stays zero.
+static struct cconv *cconv_from_utf8(const char *encoding)
+{
+    iconv_t cd = iconv_open(encoding, "UTF-8");
+    if (cd != (iconv_t)-1) {
+        struct cconv *conv = create(cd);
+        encode_replacement(conv);
+        return conv;
+    }
+    return NULL;
+}
+
+// Finish a conversion: if an incomplete character is still pending in
+// c->tbuf, discard it and emit one replacement character in its place.
+static void cconv_flush(struct cconv *c)
+{
+    if (c->tcount == 0) {
+        return;
+    }
+
+    // Replace incomplete character at end of input buffer
+    LOG_DEBUG("incomplete character at EOF");
+    add_replacement(c);
+    c->tcount = 0;
+}
+
+// Take the next complete line (up to and including '\n') from the
+// unconsumed part of the output buffer. Returns a pointer into c->obuf
+// and stores the length, newline included, in *len. If no newline is
+// buffered, returns NULL with *len set to 0 and consumes nothing.
+static char *cconv_consume_line(struct cconv *c, size_t *len)
+{
+    char *start = c->obuf + c->consumed;
+    const size_t avail = c->opos - c->consumed;
+    char *newline = memchr(start, '\n', avail);
+
+    if (newline) {
+        const size_t line_len = newline - start + 1;
+        c->consumed += line_len;
+        *len = line_len;
+        return start;
+    }
+
+    *len = 0;
+    return NULL;
+}
+
+// Take everything still unconsumed in the output buffer. Returns a
+// pointer into c->obuf and stores the byte count (possibly 0) in *len.
+static char *cconv_consume_all(struct cconv *c, size_t *len)
+{
+    const size_t start = c->consumed;
+    const size_t end = c->opos;
+
+    c->consumed = end;
+    *len = end - start;
+    return c->obuf + start;
+}
+
+// Close the iconv descriptor and release the converter together with
+// its output buffer.
+static void cconv_free(struct cconv *c)
+{
+    iconv_t cd = c->cd;
+    char *out = c->obuf;
+
+    iconv_close(cd);
+    free(out);
+    free(c);
+}
+
+// Report whether iconv can convert from encoding `from` to encoding `to`,
+// by opening a descriptor and closing it again. Empty names are rejected
+// with errno set to EINVAL; otherwise errno on failure is whatever
+// iconv_open() set.
+bool conversion_supported_by_iconv(const char *from, const char *to)
+{
+    const bool have_names = (from[0] != '\0' && to[0] != '\0');
+    if (unlikely(!have_names)) {
+        errno = EINVAL;
+        return false;
+    }
+
+    iconv_t cd = iconv_open(to, from);
+    const bool supported = (cd != (iconv_t)-1);
+    if (supported) {
+        iconv_close(cd);
+    }
+    return supported;
+}
+
+// Create an encoder that writes to `fd`, converting from UTF-8 to
+// `encoding` (unless it already is UTF-8) and optionally translating
+// '\n' to "\r\n". Returns NULL if the conversion cannot be set up; errno
+// is then left as set by iconv_open(). Release with free_file_encoder().
+FileEncoder *new_file_encoder(const Encoding *encoding, bool crlf, int fd)
+{
+    // Set up the converter before allocating the encoder. On failure
+    // there is then nothing to free, so no cleanup call can disturb the
+    // errno value the caller will inspect.
+    struct cconv *conv = NULL;
+    if (encoding->type != UTF8) {
+        conv = cconv_from_utf8(encoding->name);
+        if (!conv) {
+            return NULL;
+        }
+    }
+
+    FileEncoder *enc = xnew0(FileEncoder, 1);
+    enc->cconv = conv;
+    enc->crlf = crlf;
+    enc->fd = fd;
+    return enc;
+}
+
+// Release the encoder, its converter (if any) and its CRLF scratch
+// buffer. The file descriptor is left open.
+void free_file_encoder(FileEncoder *enc)
+{
+    struct cconv *conv = enc->cconv;
+    if (conv != NULL) {
+        cconv_free(conv);
+    }
+
+    unsigned char *scratch = enc->nbuf;
+    free(scratch);
+    free(enc);
+}
+
+// Write `size` bytes of UTF-8 text to the encoder's file descriptor,
+// applying CRLF translation and character set conversion as configured.
+// Returns the result of xwrite_all().
+//
+// NOTE: buf must contain whole characters! The converter is flushed on
+// every call, so a character split across two calls would be replaced.
+ssize_t file_encoder_write (
+    FileEncoder *enc,
+    const unsigned char *buf,
+    size_t size
+) {
+    if (enc->crlf) {
+        size = unix_to_dos(enc, buf, size);
+        buf = enc->nbuf;
+    }
+    if (enc->cconv) {
+        // The converter API works on char pointers while this API uses
+        // unsigned char. Convert explicitly in both directions; the two
+        // pointer types are not implicitly convertible in ISO C.
+        cconv_process(enc->cconv, (const char*)buf, size);
+        cconv_flush(enc->cconv);
+        buf = (const unsigned char*)cconv_consume_all(enc->cconv, &size);
+    }
+    return xwrite_all(enc->fd, buf, size);
+}
+
+size_t file_encoder_get_nr_errors(const FileEncoder *enc)
+{
+ return enc->cconv ? enc->cconv->errors : 0;
+}
+
+// Feed the next chunk of raw input (at most 7 KiB) through the converter.
+// Returns false if all input had already been consumed, true otherwise.
+// Once the last chunk has been fed, the converter is flushed so that a
+// trailing incomplete character gets replaced.
+static bool fill(FileDecoder *dec)
+{
+    if (dec->ipos == dec->isize) {
+        return false;
+    }
+
+    // Smaller than cconv.obuf to make realloc less likely
+    const size_t max = 7 * 1024;
+
+    // Do the size arithmetic in size_t instead of mixing ssize_t and
+    // size_t operands in one comparison
+    const size_t remaining = (size_t)(dec->isize - dec->ipos);
+    const size_t icount = (remaining < max) ? remaining : max;
+
+    // ibuf is unsigned char but the converter takes char pointers;
+    // convert explicitly
+    cconv_process(dec->cconv, (const char*)dec->ibuf + dec->ipos, icount);
+    dec->ipos += icount;
+    if (dec->ipos == dec->isize) {
+        // Must be flushed after all input has been fed
+        cconv_flush(dec->cconv);
+    }
+    return true;
+}
+
+// Produce the next line of converted (UTF-8) text.
+//
+// Input is fed through the converter chunk by chunk until a complete
+// line is buffered or the input runs out. On success *linep points into
+// the converter's output buffer and *lenp excludes the newline. A final
+// line without a newline is returned as-is. Returns false when nothing
+// is left.
+static bool decode_and_read_line(FileDecoder *dec, const char **linep, size_t *lenp)
+{
+    struct cconv *conv = dec->cconv;
+    size_t len;
+    char *line = cconv_consume_line(conv, &len);
+    while (!line && fill(dec)) {
+        line = cconv_consume_line(conv, &len);
+    }
+
+    if (!line) {
+        // No newline left: hand out whatever remains, if anything
+        line = cconv_consume_all(conv, &len);
+        if (len == 0) {
+            return false;
+        }
+    } else {
+        // Newline not wanted
+        len--;
+    }
+
+    *linep = line;
+    *lenp = len;
+    return true;
+}
+
+// Configure the decoder for `encoding`: UTF-8 input is read directly,
+// anything else goes through an iconv converter to UTF-8. Returns false
+// (leaving dec->encoding and dec->read_line unset) if the converter
+// cannot be created. On success the interned encoding name is recorded.
+static bool set_encoding(FileDecoder *dec, const char *encoding)
+{
+    const bool is_utf8 = (strcmp(encoding, "UTF-8") == 0);
+    if (!is_utf8) {
+        struct cconv *conv = cconv_to_utf8(encoding);
+        dec->cconv = conv;
+        if (!conv) {
+            return false;
+        }
+    }
+
+    dec->read_line = is_utf8 ? read_utf8_line : decode_and_read_line;
+    dec->encoding = str_intern(encoding);
+    return true;
+}
+
+// Create a decoder over the `size` bytes at `buf` (the buffer is
+// borrowed, not copied), interpreting them as `encoding`, or as UTF-8
+// when `encoding` is NULL. Returns NULL if no converter for the encoding
+// can be created; errno is then left as set by iconv_open(). Release
+// with free_file_decoder().
+FileDecoder *new_file_decoder (
+    const char *encoding,
+    const unsigned char *buf,
+    size_t size
+) {
+    FileDecoder *dec = xnew0(FileDecoder, 1);
+    dec->ibuf = buf;
+    dec->isize = size;
+
+    if (!encoding) {
+        encoding = "UTF-8";
+    }
+
+    if (!set_encoding(dec, encoding)) {
+        // Keep the error code from iconv_open() intact for the caller;
+        // the cleanup below must not be allowed to change it
+        const int saved_errno = errno;
+        free_file_decoder(dec);
+        errno = saved_errno;
+        return NULL;
+    }
+
+    return dec;
+}
+
+// Release the decoder and its converter (if any). The input buffer it
+// was created over is not touched.
+void free_file_decoder(FileDecoder *dec)
+{
+    struct cconv *conv = dec->cconv;
+    if (conv != NULL) {
+        cconv_free(conv);
+    }
+    free(dec);
+}
+
+// Fetch the next line via the reader that set_encoding() selected for
+// this decoder. Returns false when no more lines are available.
+bool file_decoder_read_line(FileDecoder *dec, const char **linep, size_t *lenp)
+{
+    const bool got_line = dec->read_line(dec, linep, lenp);
+    return got_line;
+}
+
+#endif