summaryrefslogtreecommitdiff
path: root/examples/dte/convert.c
diff options
context:
space:
mode:
Diffstat (limited to 'examples/dte/convert.c')
-rw-r--r--examples/dte/convert.c581
1 files changed, 0 insertions, 581 deletions
diff --git a/examples/dte/convert.c b/examples/dte/convert.c
deleted file mode 100644
index 2020ee9..0000000
--- a/examples/dte/convert.c
+++ /dev/null
@@ -1,581 +0,0 @@
-#include <errno.h>
-#include <inttypes.h>
-#include <stdlib.h>
-#include <string.h>
-#include "convert.h"
-#include "util/debug.h"
-#include "util/intern.h"
-#include "util/log.h"
-#include "util/str-util.h"
-#include "util/utf8.h"
-#include "util/xmalloc.h"
-#include "util/xreadwrite.h"
-
-struct FileEncoder {
- struct cconv *cconv;
- unsigned char *nbuf;
- size_t nsize;
- bool crlf;
- int fd;
-};
-
-struct FileDecoder {
- const char *encoding;
- const unsigned char *ibuf;
- ssize_t ipos, isize;
- struct cconv *cconv;
- bool (*read_line)(struct FileDecoder *dec, const char **linep, size_t *lenp);
-};
-
-const char *file_decoder_get_encoding(const FileDecoder *dec)
-{
- return dec->encoding;
-}
-
-static bool read_utf8_line(FileDecoder *dec, const char **linep, size_t *lenp)
-{
- const char *line = dec->ibuf + dec->ipos;
- const char *nl = memchr(line, '\n', dec->isize - dec->ipos);
- size_t len;
-
- if (nl) {
- len = nl - line;
- dec->ipos += len + 1;
- } else {
- len = dec->isize - dec->ipos;
- if (len == 0) {
- return false;
- }
- dec->ipos += len;
- }
-
- *linep = line;
- *lenp = len;
- return true;
-}
-
-static size_t unix_to_dos (
- FileEncoder *enc,
- const unsigned char *buf,
- size_t size
-) {
- if (enc->nsize < size * 2) {
- enc->nsize = size * 2;
- xrenew(enc->nbuf, enc->nsize);
- }
- size_t d = 0;
- for (size_t s = 0; s < size; s++) {
- unsigned char ch = buf[s];
- if (ch == '\n') {
- enc->nbuf[d++] = '\r';
- }
- enc->nbuf[d++] = ch;
- }
- return d;
-}
-
-#ifdef ICONV_DISABLE // iconv not available; use basic, UTF-8 implementation:
-
-bool conversion_supported_by_iconv (
- const char* UNUSED_ARG(from),
- const char* UNUSED_ARG(to)
-) {
- errno = EINVAL;
- return false;
-}
-
-FileEncoder *new_file_encoder(const Encoding *encoding, bool crlf, int fd)
-{
- if (unlikely(encoding->type != UTF8)) {
- errno = EINVAL;
- return NULL;
- }
- FileEncoder *enc = xnew0(FileEncoder, 1);
- enc->crlf = crlf;
- enc->fd = fd;
- return enc;
-}
-
-void free_file_encoder(FileEncoder *enc)
-{
- free(enc->nbuf);
- free(enc);
-}
-
-ssize_t file_encoder_write(FileEncoder *enc, const unsigned char *buf, size_t n)
-{
- if (enc->crlf) {
- n = unix_to_dos(enc, buf, n);
- buf = enc->nbuf;
- }
- return xwrite_all(enc->fd, buf, n);
-}
-
-size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc))
-{
- return 0;
-}
-
-FileDecoder *new_file_decoder(const char *encoding, const unsigned char *buf, size_t n)
-{
- if (unlikely(encoding && !streq(encoding, "UTF-8"))) {
- errno = EINVAL;
- return NULL;
- }
- FileDecoder *dec = xnew0(FileDecoder, 1);
- dec->ibuf = buf;
- dec->isize = n;
- return dec;
-}
-
-void free_file_decoder(FileDecoder *dec)
-{
- free(dec);
-}
-
-bool file_decoder_read_line(FileDecoder *dec, const char **linep, size_t *lenp)
-{
- return read_utf8_line(dec, linep, lenp);
-}
-
-#else // ICONV_DISABLE is undefined; use full iconv implementation:
-
-#include <iconv.h>
-
-static const unsigned char replacement[2] = "\xc2\xbf"; // U+00BF
-
-struct cconv {
- iconv_t cd;
- char *obuf;
- size_t osize;
- size_t opos;
- size_t consumed;
- size_t errors;
-
- // Temporary input buffer
- char tbuf[16];
- size_t tcount;
-
- // Replacement character 0xBF (inverted question mark)
- char rbuf[4];
- size_t rcount;
-
- // Input character size in bytes, or zero for UTF-8
- size_t char_size;
-};
-
-static struct cconv *create(iconv_t cd)
-{
- struct cconv *c = xnew0(struct cconv, 1);
- c->cd = cd;
- c->osize = 8192;
- c->obuf = xmalloc(c->osize);
- return c;
-}
-
-static size_t encoding_char_size(const char *encoding)
-{
- if (str_has_prefix(encoding, "UTF-16")) {
- return 2;
- }
- if (str_has_prefix(encoding, "UTF-32")) {
- return 4;
- }
- return 1;
-}
-
-static size_t iconv_wrapper (
- iconv_t cd,
- const char **restrict inbuf,
- size_t *restrict inbytesleft,
- char **restrict outbuf,
- size_t *restrict outbytesleft
-) {
- // POSIX defines the second parameter of iconv(3) as "char **restrict"
- // but NetBSD declares it as "const char **restrict"
-#ifdef __NetBSD__
- const char **restrict in = inbuf;
-#else
- char **restrict in = (char **restrict)inbuf;
-#endif
-
- return iconv(cd, in, inbytesleft, outbuf, outbytesleft);
-}
-
-static void encode_replacement(struct cconv *c)
-{
- const char *ib = replacement;
- char *ob = c->rbuf;
- size_t ic = sizeof(replacement);
- size_t oc = sizeof(c->rbuf);
- size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc);
-
- if (rc == (size_t)-1) {
- c->rbuf[0] = '\xbf';
- c->rcount = 1;
- } else {
- c->rcount = ob - c->rbuf;
- }
-}
-
-static void resize_obuf(struct cconv *c)
-{
- c->osize *= 2;
- xrenew(c->obuf, c->osize);
-}
-
-static void add_replacement(struct cconv *c)
-{
- if (c->osize - c->opos < 4) {
- resize_obuf(c);
- }
-
- memcpy(c->obuf + c->opos, c->rbuf, c->rcount);
- c->opos += c->rcount;
-}
-
-static size_t handle_invalid(struct cconv *c, const char *buf, size_t count)
-{
- LOG_DEBUG("%zu %zu", c->char_size, count);
- add_replacement(c);
- if (c->char_size == 0) {
- // Converting from UTF-8
- size_t idx = 0;
- CodePoint u = u_get_char(buf, count, &idx);
- LOG_DEBUG("U+%04" PRIX32, u);
- return idx;
- }
- if (c->char_size > count) {
- // wtf
- return 1;
- }
- return c->char_size;
-}
-
-static int xiconv(struct cconv *c, const char **ib, size_t *ic)
-{
- while (1) {
- char *ob = c->obuf + c->opos;
- size_t oc = c->osize - c->opos;
- size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc);
- c->opos = ob - c->obuf;
- if (rc == (size_t)-1) {
- switch (errno) {
- case EILSEQ:
- c->errors++;
- // Reset
- iconv(c->cd, NULL, NULL, NULL, NULL);
- return errno;
- case EINVAL:
- return errno;
- case E2BIG:
- resize_obuf(c);
- continue;
- default:
- BUG("iconv: %s", strerror(errno));
- }
- } else {
- c->errors += rc;
- }
- return 0;
- }
-}
-
-static size_t convert_incomplete(struct cconv *c, const char *input, size_t len)
-{
- size_t ipos = 0;
- while (c->tcount < sizeof(c->tbuf) && ipos < len) {
- c->tbuf[c->tcount++] = input[ipos++];
- const char *ib = c->tbuf;
- size_t ic = c->tcount;
- int rc = xiconv(c, &ib, &ic);
- if (ic > 0) {
- memmove(c->tbuf, ib, ic);
- }
- c->tcount = ic;
- if (rc == EINVAL) {
- // Incomplete character at end of input buffer; try again
- // with more input data
- continue;
- }
- if (rc == EILSEQ) {
- // Invalid multibyte sequence
- size_t skip = handle_invalid(c, c->tbuf, c->tcount);
- c->tcount -= skip;
- if (c->tcount > 0) {
- LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip);
- memmove(c->tbuf, c->tbuf + skip, c->tcount);
- continue;
- }
- return ipos;
- }
- break;
- }
-
- LOG_DEBUG("%zu %zu", ipos, c->tcount);
- return ipos;
-}
-
-static void cconv_process(struct cconv *c, const char *input, size_t len)
-{
- if (c->consumed > 0) {
- size_t fill = c->opos - c->consumed;
- memmove(c->obuf, c->obuf + c->consumed, fill);
- c->opos = fill;
- c->consumed = 0;
- }
-
- if (c->tcount > 0) {
- size_t ipos = convert_incomplete(c, input, len);
- input += ipos;
- len -= ipos;
- }
-
- const char *ib = input;
- for (size_t ic = len; ic > 0; ) {
- int r = xiconv(c, &ib, &ic);
- if (r == EINVAL) {
- // Incomplete character at end of input buffer
- if (ic < sizeof(c->tbuf)) {
- memcpy(c->tbuf, ib, ic);
- c->tcount = ic;
- } else {
- // FIXME
- }
- ic = 0;
- continue;
- }
- if (r == EILSEQ) {
- // Invalid multibyte sequence
- size_t skip = handle_invalid(c, ib, ic);
- ic -= skip;
- ib += skip;
- continue;
- }
- }
-}
-
-static struct cconv *cconv_to_utf8(const char *encoding)
-{
- iconv_t cd = iconv_open("UTF-8", encoding);
- if (cd == (iconv_t)-1) {
- return NULL;
- }
- struct cconv *c = create(cd);
- memcpy(c->rbuf, replacement, sizeof(replacement));
- c->rcount = sizeof(replacement);
- c->char_size = encoding_char_size(encoding);
- return c;
-}
-
-static struct cconv *cconv_from_utf8(const char *encoding)
-{
- iconv_t cd = iconv_open(encoding, "UTF-8");
- if (cd == (iconv_t)-1) {
- return NULL;
- }
- struct cconv *c = create(cd);
- encode_replacement(c);
- return c;
-}
-
-static void cconv_flush(struct cconv *c)
-{
- if (c->tcount > 0) {
- // Replace incomplete character at end of input buffer
- LOG_DEBUG("incomplete character at EOF");
- add_replacement(c);
- c->tcount = 0;
- }
-}
-
-static char *cconv_consume_line(struct cconv *c, size_t *len)
-{
- char *line = c->obuf + c->consumed;
- char *nl = memchr(line, '\n', c->opos - c->consumed);
- if (!nl) {
- *len = 0;
- return NULL;
- }
-
- size_t n = nl - line + 1;
- c->consumed += n;
- *len = n;
- return line;
-}
-
-static char *cconv_consume_all(struct cconv *c, size_t *len)
-{
- char *buf = c->obuf + c->consumed;
- *len = c->opos - c->consumed;
- c->consumed = c->opos;
- return buf;
-}
-
-static void cconv_free(struct cconv *c)
-{
- iconv_close(c->cd);
- free(c->obuf);
- free(c);
-}
-
-bool conversion_supported_by_iconv(const char *from, const char *to)
-{
- if (unlikely(from[0] == '\0' || to[0] == '\0')) {
- errno = EINVAL;
- return false;
- }
-
- iconv_t cd = iconv_open(to, from);
- if (cd == (iconv_t)-1) {
- return false;
- }
-
- iconv_close(cd);
- return true;
-}
-
-FileEncoder *new_file_encoder(const Encoding *encoding, bool crlf, int fd)
-{
- FileEncoder *enc = xnew0(FileEncoder, 1);
- enc->crlf = crlf;
- enc->fd = fd;
-
- if (encoding->type != UTF8) {
- enc->cconv = cconv_from_utf8(encoding->name);
- if (!enc->cconv) {
- free(enc);
- return NULL;
- }
- }
-
- return enc;
-}
-
-void free_file_encoder(FileEncoder *enc)
-{
- if (enc->cconv) {
- cconv_free(enc->cconv);
- }
- free(enc->nbuf);
- free(enc);
-}
-
-// NOTE: buf must contain whole characters!
-ssize_t file_encoder_write (
- FileEncoder *enc,
- const unsigned char *buf,
- size_t size
-) {
- if (enc->crlf) {
- size = unix_to_dos(enc, buf, size);
- buf = enc->nbuf;
- }
- if (enc->cconv) {
- cconv_process(enc->cconv, buf, size);
- cconv_flush(enc->cconv);
- buf = cconv_consume_all(enc->cconv, &size);
- }
- return xwrite_all(enc->fd, buf, size);
-}
-
-size_t file_encoder_get_nr_errors(const FileEncoder *enc)
-{
- return enc->cconv ? enc->cconv->errors : 0;
-}
-
-static bool fill(FileDecoder *dec)
-{
- if (dec->ipos == dec->isize) {
- return false;
- }
-
- // Smaller than cconv.obuf to make realloc less likely
- size_t max = 7 * 1024;
-
- size_t icount = MIN(dec->isize - dec->ipos, max);
- cconv_process(dec->cconv, dec->ibuf + dec->ipos, icount);
- dec->ipos += icount;
- if (dec->ipos == dec->isize) {
- // Must be flushed after all input has been fed
- cconv_flush(dec->cconv);
- }
- return true;
-}
-
-static bool decode_and_read_line(FileDecoder *dec, const char **linep, size_t *lenp)
-{
- char *line;
- size_t len;
- while (1) {
- line = cconv_consume_line(dec->cconv, &len);
- if (line || !fill(dec)) {
- break;
- }
- }
-
- if (line) {
- // Newline not wanted
- len--;
- } else {
- line = cconv_consume_all(dec->cconv, &len);
- if (len == 0) {
- return false;
- }
- }
-
- *linep = line;
- *lenp = len;
- return true;
-}
-
-static bool set_encoding(FileDecoder *dec, const char *encoding)
-{
- if (strcmp(encoding, "UTF-8") == 0) {
- dec->read_line = read_utf8_line;
- } else {
- dec->cconv = cconv_to_utf8(encoding);
- if (!dec->cconv) {
- return false;
- }
- dec->read_line = decode_and_read_line;
- }
- dec->encoding = str_intern(encoding);
- return true;
-}
-
-FileDecoder *new_file_decoder (
- const char *encoding,
- const unsigned char *buf,
- size_t size
-) {
- FileDecoder *dec = xnew0(FileDecoder, 1);
- dec->ibuf = buf;
- dec->isize = size;
-
- if (!encoding) {
- encoding = "UTF-8";
- }
-
- if (!set_encoding(dec, encoding)) {
- free_file_decoder(dec);
- return NULL;
- }
-
- return dec;
-}
-
-void free_file_decoder(FileDecoder *dec)
-{
- if (dec->cconv) {
- cconv_free(dec->cconv);
- }
- free(dec);
-}
-
-bool file_decoder_read_line(FileDecoder *dec, const char **linep, size_t *lenp)
-{
- return dec->read_line(dec, linep, lenp);
-}
-
-#endif