diff options
Diffstat (limited to '')
-rw-r--r-- | libwc/utf8.c | 346 |
1 files changed, 346 insertions, 0 deletions
diff --git a/libwc/utf8.c b/libwc/utf8.c new file mode 100644 index 0000000..e523139 --- /dev/null +++ b/libwc/utf8.c @@ -0,0 +1,346 @@ + +#ifdef USE_UNICODE + +#include "wc.h" +#include "ucs.h" +#include "utf8.h" +#include "wtf.h" + +wc_uint8 WC_UTF8_MAP[ 0x100 ] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, +}; + +static wc_uchar utf8_buf[7]; + +size_t +wc_ucs_to_utf8(wc_uint32 ucs, wc_uchar *utf8) +{ + if (ucs < WC_C_UTF8_L2) { + utf8[0] = ucs; + utf8[1] = 0; + return 1; + } else if (ucs < WC_C_UTF8_L3) { + utf8[0] = (ucs >> 6) | 0xc0; + utf8[1] = (ucs & 0x3f) | 0x80; + utf8[2] = 0; + return 2; + } else if (ucs < WC_C_UTF8_L4) { + utf8[0] = (ucs >> 12) | 0xe0; + utf8[1] = ((ucs >> 6) & 0x3f) | 0x80; + utf8[2] = (ucs & 0x3f) | 0x80; + utf8[3] = 0; + return 3; + } else if (ucs < WC_C_UTF8_L5) { + utf8[0] = (ucs >> 18) | 0xf0; + utf8[1] = ((ucs >> 12) & 0x3f) | 0x80; + utf8[2] = ((ucs >> 6) & 0x3f) | 0x80; + utf8[3] = (ucs & 0x3f) | 0x80; + utf8[4] = 0; + return 4; + } else if (ucs < WC_C_UTF8_L6) { + utf8[0] = (ucs >> 24) | 0xf8; + utf8[1] = ((ucs >> 18) & 0x3f) | 0x80; + utf8[2] = ((ucs >> 12) & 0x3f) | 0x80; + utf8[3] = ((ucs >> 6) & 0x3f) | 0x80; + utf8[4] = (ucs & 0x3f) | 0x80; + utf8[5] = 0; + return 5; + } else if (ucs <= WC_C_UCS4_END) { + utf8[0] = (ucs >> 30) | 0xfc; + utf8[1] = ((ucs >> 24) & 0x3f) | 0x80; + utf8[2] = ((ucs >> 18) & 0x3f) | 0x80; + utf8[3] = ((ucs >> 12) & 0x3f) | 0x80; + utf8[4] = ((ucs >> 6) & 0x3f) | 0x80; + utf8[5] = (ucs & 0x3f) | 0x80; + utf8[6] = 0; + return 6; + } else { + utf8[0] = 0; + return 0; + } +} + +wc_uint32 +wc_utf8_to_ucs(wc_uchar *utf8) +{ + wc_uint32 ucs; + + switch (WC_UTF8_MAP[utf8[0]]) { + case 1: + ucs = (wc_uint32) utf8[0]; + if (ucs >= WC_C_UTF8_L2) + break; + return ucs; + case 2: + ucs = ((wc_uint32)(utf8[0] & 0x1f) << 6) + | (wc_uint32)(utf8[1] & 0x3f); + if (ucs < WC_C_UTF8_L2) + break; + return ucs; + case 3: + ucs = ((wc_uint32)(utf8[0] & 0x0f) << 12) + | ((wc_uint32)(utf8[1] & 0x3f) << 6) + | (wc_uint32)(utf8[2] & 0x3f); + if (ucs < WC_C_UTF8_L3) + break; + return ucs; + case 4: + ucs = ((wc_uint32)(utf8[0] & 0x07) << 18) + | ((wc_uint32)(utf8[1] & 0x3f) << 12) + | ((wc_uint32)(utf8[2] & 0x3f) << 6) + | (wc_uint32)(utf8[3] & 0x3f); + if (ucs < WC_C_UTF8_L4) + break; + return ucs; + case 5: + ucs = ((wc_uint32)(utf8[0] & 0x03) << 24) + | ((wc_uint32)(utf8[1] & 0x3f) << 18) + | ((wc_uint32)(utf8[2] & 0x3f) << 12) + | ((wc_uint32)(utf8[3] & 0x3f) << 6) + | (wc_uint32)(utf8[4] & 0x3f); + if (ucs < WC_C_UTF8_L5) + break; + return ucs; + case 6: + ucs = ((wc_uint32)(utf8[0] & 0x01) << 30) + | ((wc_uint32)(utf8[1] & 0x3f) << 24) + | ((wc_uint32)(utf8[2] & 0x3f) << 18) + | ((wc_uint32)(utf8[3] & 0x3f) << 12) + | ((wc_uint32)(utf8[4] & 0x3f) << 6) + | (wc_uint32)(utf8[5] & 0x3f); + if (ucs < WC_C_UTF8_L6) + break; + return ucs; + default: + break; + } + return WC_C_UCS4_ERROR; +} + +Str +wc_conv_from_utf8(Str is, wc_ces ces) +{ + Str os; + wc_uchar *sp = (wc_uchar *)is->ptr; + wc_uchar *ep = sp + is->length; + wc_uchar *p; + wc_uchar *q = NULL; + int state = WC_UTF8_NOSTATE; + size_t next = 0; + wc_uint32 ucs; + wc_status st; + + for (p = sp; p < ep && *p < 0x80; p++) + ; + if (p == ep) + return is; + os = Strnew_size(is->length * 4 / 3); + if (p > sp) + Strcat_charp_n(os, is->ptr, (int)(p - sp)); + + st.tag = NULL; + st.ntag = 0; + for (; p < ep; p++) { + switch (state) { + case WC_UTF8_NOSTATE: + next = WC_UTF8_MAP[*p]; + switch (next) { + case 1: + wtf_push_ucs(os, (wc_uint32)*p, &st); + break; + case 8: + Strcat_char(os, (char)*p); + break; + case 0: + case 7: + wtf_push_unknown(os, p, 1); + break; + default: + q = p; + next--; + state = WC_UTF8_NEXT; + break; + } + break; + case WC_UTF8_NEXT: + if (WC_UTF8_MAP[*p]) { + wtf_push_unknown(os, q, p - q + 1); + state = WC_UTF8_NOSTATE; + break; + } + if (--next) + break; + state = WC_UTF8_NOSTATE; + ucs = wc_utf8_to_ucs(q); + if (ucs == WC_C_UCS4_ERROR || + (ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END)) + wtf_push_unknown(os, q, p - q + 1); + else if (ucs != WC_C_UCS2_BOM) + wtf_push_ucs(os, ucs, &st); + break; + } + } + switch (state) { + case WC_UTF8_NEXT: + wtf_push_unknown(os, q, p - q); + break; + } + return os; +} + +static int +wc_push_tag_to_utf8(Str os, int ntag) +{ + char *p; + + if (ntag) { + p = wc_ucs_get_tag(ntag); + if (p == NULL) + ntag = 0; + } + if (ntag) { + wc_ucs_to_utf8(WC_C_LANGUAGE_TAG, utf8_buf); + Strcat_charp(os, (char *)utf8_buf); + for (; *p; p++) { + wc_ucs_to_utf8(WC_C_LANGUAGE_TAG0 | *p, utf8_buf); + Strcat_charp(os, (char *)utf8_buf); + } + } else { + wc_ucs_to_utf8(WC_C_CANCEL_TAG, utf8_buf); + Strcat_charp(os, (char *)utf8_buf); + } + return ntag; +} + +void +wc_push_to_utf8(Str os, wc_wchar_t cc, wc_status *st) +{ + while (1) { + switch (WC_CCS_SET(cc.ccs)) { + case WC_CCS_US_ASCII: + if (st->ntag) + st->ntag = wc_push_tag_to_utf8(os, 0); + Strcat_char(os, (char)(cc.code & 0x7f)); + return; + case WC_CCS_UCS2: + case WC_CCS_UCS4: + if (st->ntag) + st->ntag = wc_push_tag_to_utf8(os, 0); + wc_ucs_to_utf8(cc.code, utf8_buf); + Strcat_charp(os, (char *)utf8_buf); + return; + case WC_CCS_UCS_TAG: + if (WcOption.use_language_tag && wc_ucs_tag_to_tag(cc.code) != st->ntag) + st->ntag = wc_push_tag_to_utf8(os, wc_ucs_tag_to_tag(cc.code)); + wc_ucs_to_utf8(wc_ucs_tag_to_ucs(cc.code), utf8_buf); + Strcat_charp(os, (char *)utf8_buf); + return; + case WC_CCS_ISO_8859_1: + if (st->ntag) + st->ntag = wc_push_tag_to_utf8(os, 0); + wc_ucs_to_utf8((cc.code | 0x80), utf8_buf); + Strcat_charp(os, (char *)utf8_buf); + return; + case WC_CCS_UNKNOWN_W: + if (!WcOption.no_replace) { + if (st->ntag) + st->ntag = wc_push_tag_to_utf8(os, 0); + Strcat_charp(os, WC_REPLACE_W); + } + return; + case WC_CCS_UNKNOWN: + if (!WcOption.no_replace) { + if (st->ntag) + st->ntag = wc_push_tag_to_utf8(os, 0); + Strcat_charp(os, WC_REPLACE); + } + return; + default: + if (WcOption.ucs_conv && + (cc.code = wc_any_to_ucs(cc)) != WC_C_UCS4_ERROR) + cc.ccs = WC_CCS_UCS2; + else + cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN; + continue; + } + } +} + +void +wc_push_to_utf8_end(Str os, wc_status *st) +{ + if (st->ntag) + st->ntag = wc_push_tag_to_utf8(os, 0); + return; +} + +Str +wc_char_conv_from_utf8(wc_uchar c, wc_status *st) +{ + static Str os; + static wc_uchar buf[6]; + static size_t nbuf, next; + wc_uint32 ucs; + + if (st->state == -1) { + st->state = WC_UTF8_NOSTATE; + os = Strnew_size(8); + st->tag = NULL; + st->ntag = 0; + nbuf = 0; + } + + switch (st->state) { + case WC_UTF8_NOSTATE: + switch (next = WC_UTF8_MAP[c]) { + case 1: + wtf_push_ucs(os, (wc_uint32)c, st); + break; + case 8: + Strcat_char(os, (char)c); + break; + case 0: + case 7: + break; + default: + buf[nbuf++] = c; + next--; + st->state = WC_UTF8_NEXT; + return NULL; + } + break; + case WC_UTF8_NEXT: + if (WC_UTF8_MAP[c]) + break; + buf[nbuf++] = c; + if (--next) + return NULL; + ucs = wc_utf8_to_ucs(buf); + if (ucs == WC_C_UCS4_ERROR || + (ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END)) + break; + if (ucs != WC_C_UCS2_BOM) + wtf_push_ucs(os, ucs, st); + break; + } + st->state = -1; + return os; +} + +#endif |