diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2023-11-09 23:19:53 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2023-11-09 23:19:53 +0100 |
| commit | 1566b6faa8534118c3566188181367cd0868468f (patch) | |
| tree | 1de8d4b369efb5e592685a31088f798a6b63ffa1 /examples/dte/convert.c | |
| parent | 349991bf6efe473ab9a5cbdae0a8114d72b997e3 (diff) | |
| download | crep-1566b6faa8534118c3566188181367cd0868468f.tar.gz | |
Added partial matching and introduced threads
Diffstat (limited to 'examples/dte/convert.c')
| -rw-r--r-- | examples/dte/convert.c | 581 |
1 files changed, 581 insertions, 0 deletions
diff --git a/examples/dte/convert.c b/examples/dte/convert.c new file mode 100644 index 0000000..2020ee9 --- /dev/null +++ b/examples/dte/convert.c @@ -0,0 +1,581 @@ +#include <errno.h> +#include <inttypes.h> +#include <stdlib.h> +#include <string.h> +#include "convert.h" +#include "util/debug.h" +#include "util/intern.h" +#include "util/log.h" +#include "util/str-util.h" +#include "util/utf8.h" +#include "util/xmalloc.h" +#include "util/xreadwrite.h" + +struct FileEncoder { + struct cconv *cconv; + unsigned char *nbuf; + size_t nsize; + bool crlf; + int fd; +}; + +struct FileDecoder { + const char *encoding; + const unsigned char *ibuf; + ssize_t ipos, isize; + struct cconv *cconv; + bool (*read_line)(struct FileDecoder *dec, const char **linep, size_t *lenp); +}; + +const char *file_decoder_get_encoding(const FileDecoder *dec) +{ + return dec->encoding; +} + +static bool read_utf8_line(FileDecoder *dec, const char **linep, size_t *lenp) +{ + const char *line = dec->ibuf + dec->ipos; + const char *nl = memchr(line, '\n', dec->isize - dec->ipos); + size_t len; + + if (nl) { + len = nl - line; + dec->ipos += len + 1; + } else { + len = dec->isize - dec->ipos; + if (len == 0) { + return false; + } + dec->ipos += len; + } + + *linep = line; + *lenp = len; + return true; +} + +static size_t unix_to_dos ( + FileEncoder *enc, + const unsigned char *buf, + size_t size +) { + if (enc->nsize < size * 2) { + enc->nsize = size * 2; + xrenew(enc->nbuf, enc->nsize); + } + size_t d = 0; + for (size_t s = 0; s < size; s++) { + unsigned char ch = buf[s]; + if (ch == '\n') { + enc->nbuf[d++] = '\r'; + } + enc->nbuf[d++] = ch; + } + return d; +} + +#ifdef ICONV_DISABLE // iconv not available; use basic, UTF-8 implementation: + +bool conversion_supported_by_iconv ( + const char* UNUSED_ARG(from), + const char* UNUSED_ARG(to) +) { + errno = EINVAL; + return false; +} + +FileEncoder *new_file_encoder(const Encoding *encoding, bool crlf, int fd) +{ + if (unlikely(encoding->type != UTF8)) { + errno = EINVAL; + return NULL; + } + FileEncoder *enc = xnew0(FileEncoder, 1); + enc->crlf = crlf; + enc->fd = fd; + return enc; +} + +void free_file_encoder(FileEncoder *enc) +{ + free(enc->nbuf); + free(enc); +} + +ssize_t file_encoder_write(FileEncoder *enc, const unsigned char *buf, size_t n) +{ + if (enc->crlf) { + n = unix_to_dos(enc, buf, n); + buf = enc->nbuf; + } + return xwrite_all(enc->fd, buf, n); +} + +size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc)) +{ + return 0; +} + +FileDecoder *new_file_decoder(const char *encoding, const unsigned char *buf, size_t n) +{ + if (unlikely(encoding && !streq(encoding, "UTF-8"))) { + errno = EINVAL; + return NULL; + } + FileDecoder *dec = xnew0(FileDecoder, 1); + dec->ibuf = buf; + dec->isize = n; + return dec; +} + +void free_file_decoder(FileDecoder *dec) +{ + free(dec); +} + +bool file_decoder_read_line(FileDecoder *dec, const char **linep, size_t *lenp) +{ + return read_utf8_line(dec, linep, lenp); +} + +#else // ICONV_DISABLE is undefined; use full iconv implementation: + +#include <iconv.h> + +static const unsigned char replacement[2] = "\xc2\xbf"; // U+00BF + +struct cconv { + iconv_t cd; + char *obuf; + size_t osize; + size_t opos; + size_t consumed; + size_t errors; + + // Temporary input buffer + char tbuf[16]; + size_t tcount; + + // Replacement character 0xBF (inverted question mark) + char rbuf[4]; + size_t rcount; + + // Input character size in bytes, or zero for UTF-8 + size_t char_size; +}; + +static struct cconv *create(iconv_t cd) +{ + struct cconv *c = xnew0(struct cconv, 1); + c->cd = cd; + c->osize = 8192; + c->obuf = xmalloc(c->osize); + return c; +} + +static size_t encoding_char_size(const char *encoding) +{ + if (str_has_prefix(encoding, "UTF-16")) { + return 2; + } + if (str_has_prefix(encoding, "UTF-32")) { + return 4; + } + return 1; +} + +static size_t iconv_wrapper ( + iconv_t cd, + const char **restrict inbuf, + size_t *restrict inbytesleft, + char **restrict outbuf, + size_t *restrict outbytesleft +) { + // POSIX defines the second parameter of iconv(3) as "char **restrict" + // but NetBSD declares it as "const char **restrict" +#ifdef __NetBSD__ + const char **restrict in = inbuf; +#else + char **restrict in = (char **restrict)inbuf; +#endif + + return iconv(cd, in, inbytesleft, outbuf, outbytesleft); +} + +static void encode_replacement(struct cconv *c) +{ + const char *ib = replacement; + char *ob = c->rbuf; + size_t ic = sizeof(replacement); + size_t oc = sizeof(c->rbuf); + size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc); + + if (rc == (size_t)-1) { + c->rbuf[0] = '\xbf'; + c->rcount = 1; + } else { + c->rcount = ob - c->rbuf; + } +} + +static void resize_obuf(struct cconv *c) +{ + c->osize *= 2; + xrenew(c->obuf, c->osize); +} + +static void add_replacement(struct cconv *c) +{ + if (c->osize - c->opos < 4) { + resize_obuf(c); + } + + memcpy(c->obuf + c->opos, c->rbuf, c->rcount); + c->opos += c->rcount; +} + +static size_t handle_invalid(struct cconv *c, const char *buf, size_t count) +{ + LOG_DEBUG("%zu %zu", c->char_size, count); + add_replacement(c); + if (c->char_size == 0) { + // Converting from UTF-8 + size_t idx = 0; + CodePoint u = u_get_char(buf, count, &idx); + LOG_DEBUG("U+%04" PRIX32, u); + return idx; + } + if (c->char_size > count) { + // wtf + return 1; + } + return c->char_size; +} + +static int xiconv(struct cconv *c, const char **ib, size_t *ic) +{ + while (1) { + char *ob = c->obuf + c->opos; + size_t oc = c->osize - c->opos; + size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc); + c->opos = ob - c->obuf; + if (rc == (size_t)-1) { + switch (errno) { + case EILSEQ: + c->errors++; + // Reset + iconv(c->cd, NULL, NULL, NULL, NULL); + return errno; + case EINVAL: + return errno; + case E2BIG: + resize_obuf(c); + continue; + default: + BUG("iconv: %s", strerror(errno)); + } + } else { + c->errors += rc; + } + return 0; + } +} + +static size_t convert_incomplete(struct cconv *c, const char *input, size_t len) +{ + size_t ipos = 0; + while (c->tcount < sizeof(c->tbuf) && ipos < len) { + c->tbuf[c->tcount++] = input[ipos++]; + const char *ib = c->tbuf; + size_t ic = c->tcount; + int rc = xiconv(c, &ib, &ic); + if (ic > 0) { + memmove(c->tbuf, ib, ic); + } + c->tcount = ic; + if (rc == EINVAL) { + // Incomplete character at end of input buffer; try again + // with more input data + continue; + } + if (rc == EILSEQ) { + // Invalid multibyte sequence + size_t skip = handle_invalid(c, c->tbuf, c->tcount); + c->tcount -= skip; + if (c->tcount > 0) { + LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip); + memmove(c->tbuf, c->tbuf + skip, c->tcount); + continue; + } + return ipos; + } + break; + } + + LOG_DEBUG("%zu %zu", ipos, c->tcount); + return ipos; +} + +static void cconv_process(struct cconv *c, const char *input, size_t len) +{ + if (c->consumed > 0) { + size_t fill = c->opos - c->consumed; + memmove(c->obuf, c->obuf + c->consumed, fill); + c->opos = fill; + c->consumed = 0; + } + + if (c->tcount > 0) { + size_t ipos = convert_incomplete(c, input, len); + input += ipos; + len -= ipos; + } + + const char *ib = input; + for (size_t ic = len; ic > 0; ) { + int r = xiconv(c, &ib, &ic); + if (r == EINVAL) { + // Incomplete character at end of input buffer + if (ic < sizeof(c->tbuf)) { + memcpy(c->tbuf, ib, ic); + c->tcount = ic; + } else { + // FIXME + } + ic = 0; + continue; + } + if (r == EILSEQ) { + // Invalid multibyte sequence + size_t skip = handle_invalid(c, ib, ic); + ic -= skip; + ib += skip; + continue; + } + } +} + +static struct cconv *cconv_to_utf8(const char *encoding) +{ + iconv_t cd = iconv_open("UTF-8", encoding); + if (cd == (iconv_t)-1) { + return NULL; + } + struct cconv *c = create(cd); + memcpy(c->rbuf, replacement, sizeof(replacement)); + c->rcount = sizeof(replacement); + c->char_size = encoding_char_size(encoding); + return c; +} + +static struct cconv *cconv_from_utf8(const char *encoding) +{ + iconv_t cd = iconv_open(encoding, "UTF-8"); + if (cd == (iconv_t)-1) { + return NULL; + } + struct cconv *c = create(cd); + encode_replacement(c); + return c; +} + +static void cconv_flush(struct cconv *c) +{ + if (c->tcount > 0) { + // Replace incomplete character at end of input buffer + LOG_DEBUG("incomplete character at EOF"); + add_replacement(c); + c->tcount = 0; + } +} + +static char *cconv_consume_line(struct cconv *c, size_t *len) +{ + char *line = c->obuf + c->consumed; + char *nl = memchr(line, '\n', c->opos - c->consumed); + if (!nl) { + *len = 0; + return NULL; + } + + size_t n = nl - line + 1; + c->consumed += n; + *len = n; + return line; +} + +static char *cconv_consume_all(struct cconv *c, size_t *len) +{ + char *buf = c->obuf + c->consumed; + *len = c->opos - c->consumed; + c->consumed = c->opos; + return buf; +} + +static void cconv_free(struct cconv *c) +{ + iconv_close(c->cd); + free(c->obuf); + free(c); +} + +bool conversion_supported_by_iconv(const char *from, const char *to) +{ + if (unlikely(from[0] == '\0' || to[0] == '\0')) { + errno = EINVAL; + return false; + } + + iconv_t cd = iconv_open(to, from); + if (cd == (iconv_t)-1) { + return false; + } + + iconv_close(cd); + return true; +} + +FileEncoder *new_file_encoder(const Encoding *encoding, bool crlf, int fd) +{ + FileEncoder *enc = xnew0(FileEncoder, 1); + enc->crlf = crlf; + enc->fd = fd; + + if (encoding->type != UTF8) { + enc->cconv = cconv_from_utf8(encoding->name); + if (!enc->cconv) { + free(enc); + return NULL; + } + } + + return enc; +} + +void free_file_encoder(FileEncoder *enc) +{ + if (enc->cconv) { + cconv_free(enc->cconv); + } + free(enc->nbuf); + free(enc); +} + +// NOTE: buf must contain whole characters! +ssize_t file_encoder_write ( + FileEncoder *enc, + const unsigned char *buf, + size_t size +) { + if (enc->crlf) { + size = unix_to_dos(enc, buf, size); + buf = enc->nbuf; + } + if (enc->cconv) { + cconv_process(enc->cconv, buf, size); + cconv_flush(enc->cconv); + buf = cconv_consume_all(enc->cconv, &size); + } + return xwrite_all(enc->fd, buf, size); +} + +size_t file_encoder_get_nr_errors(const FileEncoder *enc) +{ + return enc->cconv ? enc->cconv->errors : 0; +} + +static bool fill(FileDecoder *dec) +{ + if (dec->ipos == dec->isize) { + return false; + } + + // Smaller than cconv.obuf to make realloc less likely + size_t max = 7 * 1024; + + size_t icount = MIN(dec->isize - dec->ipos, max); + cconv_process(dec->cconv, dec->ibuf + dec->ipos, icount); + dec->ipos += icount; + if (dec->ipos == dec->isize) { + // Must be flushed after all input has been fed + cconv_flush(dec->cconv); + } + return true; +} + +static bool decode_and_read_line(FileDecoder *dec, const char **linep, size_t *lenp) +{ + char *line; + size_t len; + while (1) { + line = cconv_consume_line(dec->cconv, &len); + if (line || !fill(dec)) { + break; + } + } + + if (line) { + // Newline not wanted + len--; + } else { + line = cconv_consume_all(dec->cconv, &len); + if (len == 0) { + return false; + } + } + + *linep = line; + *lenp = len; + return true; +} + +static bool set_encoding(FileDecoder *dec, const char *encoding) +{ + if (strcmp(encoding, "UTF-8") == 0) { + dec->read_line = read_utf8_line; + } else { + dec->cconv = cconv_to_utf8(encoding); + if (!dec->cconv) { + return false; + } + dec->read_line = decode_and_read_line; + } + dec->encoding = str_intern(encoding); + return true; +} + +FileDecoder *new_file_decoder ( + const char *encoding, + const unsigned char *buf, + size_t size +) { + FileDecoder *dec = xnew0(FileDecoder, 1); + dec->ibuf = buf; + dec->isize = size; + + if (!encoding) { + encoding = "UTF-8"; + } + + if (!set_encoding(dec, encoding)) { + free_file_decoder(dec); + return NULL; + } + + return dec; +} + +void free_file_decoder(FileDecoder *dec) +{ + if (dec->cconv) { + cconv_free(dec->cconv); + } + free(dec); +} + +bool file_decoder_read_line(FileDecoder *dec, const char **linep, size_t *lenp) +{ + return dec->read_line(dec, linep, lenp); +} + +#endif |
