aboutsummaryrefslogtreecommitdiffstats
path: root/libwc/utf8.c
diff options
context:
space:
mode:
author Tatsuya Kinoshita <tats@vega.ocn.ne.jp> 2011-05-04 07:18:09 +0000
committer Tatsuya Kinoshita <tats@vega.ocn.ne.jp> 2011-05-04 07:18:09 +0000
commit 5f8e0f8ef9a422691dd72e8a953a42a41478fcb4 (patch)
tree 4b2df4796a534793648b3c4fc532fc36bd0cd525 /libwc/utf8.c
parent Releasing debian version 0.3-2.4 (diff)
download w3m-5f8e0f8ef9a422691dd72e8a953a42a41478fcb4.tar.gz
w3m-5f8e0f8ef9a422691dd72e8a953a42a41478fcb4.zip
Releasing debian version 0.5.1-1 debian/0.5.1-1
Diffstat (limited to 'libwc/utf8.c')
-rw-r--r-- libwc/utf8.c 346
1 files changed, 346 insertions, 0 deletions
diff --git a/libwc/utf8.c b/libwc/utf8.c
new file mode 100644
index 0000000..e523139
--- /dev/null
+++ b/libwc/utf8.c
@@ -0,0 +1,346 @@
+
+#ifdef USE_UNICODE
+
+#include "wc.h"
+#include "ucs.h"
+#include "utf8.h"
+#include "wtf.h"
+
+wc_uint8 WC_UTF8_MAP[ 0x100 ] = {	/* class of each byte value 0x00-0xff, 16 entries per row */
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,	/* 0x00-0x0f: class 8, copied to the output unchanged by the decoders in this file */
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,	/* 0x10-0x1f: class 8 */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0x20-0x2f: class 1, a complete one-byte character */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0x30-0x3f: class 1 */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0x40-0x4f: class 1 */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0x50-0x5f: class 1 */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0x60-0x6f: class 1 */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8,	/* 0x70-0x7e: class 1; 0x7f: class 8 */
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x80-0x8f: class 0, continuation byte (the only class accepted inside a sequence) */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x90-0x9f: class 0 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xa0-0xaf: class 0 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xb0-0xbf: class 0 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0xc0-0xcf: class 2, first byte of a 2-byte sequence */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0xd0-0xdf: class 2 */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	/* 0xe0-0xef: class 3, first byte of a 3-byte sequence */
+ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7,	/* 0xf0-0xf7: 4-byte lead; 0xf8-0xfb: 5-byte lead; 0xfc-0xfd: 6-byte lead; 0xfe-0xff: class 7, never decodable */
+};
+
+static wc_uchar utf8_buf[7];	/* shared scratch output for wc_ucs_to_utf8(): at most 6 encoded bytes plus the terminating 0 */
+
+/*
+ * Encode one code point as a 0-terminated UTF-8 byte sequence.
+ *
+ *   ucs   code point to encode
+ *   utf8  destination; must have room for the encoded bytes plus the
+ *         terminating 0 (7 bytes covers the longest, 6-byte, form)
+ *
+ * Returns the number of encoded bytes (1..6), not counting the terminator.
+ * A value above WC_C_UCS4_END stores an empty string and returns 0.
+ */
+size_t
+wc_ucs_to_utf8(wc_uint32 ucs, wc_uchar *utf8)
+{
+    size_t len;
+    size_t i;
+    wc_uchar lead;
+
+    /* Pick the sequence length and the marker bits of the first byte. */
+    if (ucs < WC_C_UTF8_L2) {
+        len = 1;
+        lead = 0x00;
+    } else if (ucs < WC_C_UTF8_L3) {
+        len = 2;
+        lead = 0xc0;
+    } else if (ucs < WC_C_UTF8_L4) {
+        len = 3;
+        lead = 0xe0;
+    } else if (ucs < WC_C_UTF8_L5) {
+        len = 4;
+        lead = 0xf0;
+    } else if (ucs < WC_C_UTF8_L6) {
+        len = 5;
+        lead = 0xf8;
+    } else if (ucs <= WC_C_UCS4_END) {
+        len = 6;
+        lead = 0xfc;
+    } else {
+        utf8[0] = 0;
+        return 0;
+    }
+
+    utf8[len] = 0;
+    /* Fill the trailing bytes from the end, six payload bits at a time. */
+    for (i = len - 1; i > 0; i--) {
+        utf8[i] = (ucs & 0x3f) | 0x80;
+        ucs >>= 6;
+    }
+    /* Whatever is left of the value belongs in the first byte. */
+    utf8[0] = ucs | lead;
+    return len;
+}
+
+/*
+ * Decode the single UTF-8 sequence that starts at utf8[0].
+ *
+ * The sequence length is taken from WC_UTF8_MAP[utf8[0]]; that many bytes
+ * are read from utf8.  The trailing bytes are only masked with 0x3f, they
+ * are not checked for being continuation bytes, so the caller has to
+ * validate them beforehand.
+ *
+ * Returns the decoded code point, or WC_C_UCS4_ERROR when the first byte
+ * is not class 1..6 or when the value is smaller than the minimum for its
+ * sequence length (an overlong encoding).
+ */
+wc_uint32
+wc_utf8_to_ucs(wc_uchar *utf8)
+{
+    /* Smallest value each multi-byte length may legitimately encode. */
+    const wc_uint32 min_ucs[7] = {
+        0, 0,
+        WC_C_UTF8_L2, WC_C_UTF8_L3, WC_C_UTF8_L4, WC_C_UTF8_L5, WC_C_UTF8_L6
+    };
+    int len = WC_UTF8_MAP[utf8[0]];
+    wc_uint32 value;
+    int i;
+
+    if (len == 1) {
+        value = (wc_uint32) utf8[0];
+        if (value >= WC_C_UTF8_L2)
+            return WC_C_UCS4_ERROR;
+        return value;
+    }
+    if (len < 2 || len > 6)
+        return WC_C_UCS4_ERROR;
+
+    /* Payload mask of the lead byte: 0x1f, 0x0f, 0x07, 0x03, 0x01. */
+    value = (wc_uint32)(utf8[0] & (0x7f >> len));
+    for (i = 1; i < len; i++)
+        value = (value << 6) | (wc_uint32)(utf8[i] & 0x3f);
+
+    if (value < min_ucs[len])
+        return WC_C_UCS4_ERROR;
+    return value;
+}
+
+/*
+ * Convert a UTF-8 encoded Str into the representation produced by the
+ * wtf_push_*() helpers.
+ *
+ *   is   input string
+ *   ces  not referenced by this function
+ *
+ * Returns `is` itself when every byte is below 0x80; otherwise a newly
+ * created Str.  Bytes that cannot be decoded are handed to
+ * wtf_push_unknown().  A decoded value in the surrogate range is treated
+ * as undecodable, and a decoded WC_C_UCS2_BOM is dropped.
+ */
+Str
+wc_conv_from_utf8(Str is, wc_ces ces)
+{
+    Str os;
+    wc_uchar *sp = (wc_uchar *)is->ptr;
+    wc_uchar *ep = sp + is->length;
+    wc_uchar *p;
+    wc_uchar *q = NULL;		/* first byte of the sequence being collected */
+    int state = WC_UTF8_NOSTATE;
+    size_t next = 0;		/* continuation bytes still expected */
+    wc_uint32 ucs;
+    wc_status st;
+
+    /* Fast path: nothing to convert when the input is pure 7-bit. */
+    for (p = sp; p < ep && *p < 0x80; p++)
+        ;
+    if (p == ep)
+        return is;
+    os = Strnew_size(is->length * 4 / 3);
+    if (p > sp)
+        Strcat_charp_n(os, is->ptr, (int)(p - sp));
+
+    st.tag = NULL;
+    st.ntag = 0;
+    for (; p < ep; p++) {
+        switch (state) {
+        case WC_UTF8_NOSTATE:
+            next = WC_UTF8_MAP[*p];
+            switch (next) {
+            case 1:
+                wtf_push_ucs(os, (wc_uint32)*p, &st);
+                break;
+            case 8:
+                Strcat_char(os, (char)*p);
+                break;
+            case 0:
+            case 7:
+                /* stray continuation byte or 0xfe/0xff */
+                wtf_push_unknown(os, p, 1);
+                break;
+            default:
+                /* lead byte: remember where the sequence starts */
+                q = p;
+                next--;
+                state = WC_UTF8_NEXT;
+                break;
+            }
+            break;
+        case WC_UTF8_NEXT:
+            if (WC_UTF8_MAP[*p]) {
+                /*
+                 * The sequence was cut short by a byte that is not a
+                 * continuation byte.  Report only the broken prefix
+                 * q..p-1 and step back so the loop examines *p again as
+                 * the start of a new sequence; otherwise a valid
+                 * character beginning at *p would be swallowed together
+                 * with the broken one.
+                 */
+                wtf_push_unknown(os, q, p - q);
+                state = WC_UTF8_NOSTATE;
+                p--;
+                break;
+            }
+            if (--next)
+                break;
+            state = WC_UTF8_NOSTATE;
+            ucs = wc_utf8_to_ucs(q);
+            if (ucs == WC_C_UCS4_ERROR ||
+                (ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END))
+                wtf_push_unknown(os, q, p - q + 1);
+            else if (ucs != WC_C_UCS2_BOM)
+                wtf_push_ucs(os, ucs, &st);
+            break;
+        }
+    }
+    /* Input ended in the middle of a sequence: report what was collected. */
+    if (state == WC_UTF8_NEXT)
+        wtf_push_unknown(os, q, p - q);
+    return os;
+}
+
+/*
+ * Append a language-tag control sequence to os.
+ *
+ * When ntag is non-zero and wc_ucs_get_tag() yields a string for it,
+ * WC_C_LANGUAGE_TAG is written followed by one tag character
+ * (WC_C_LANGUAGE_TAG0 | c) per character of that string, and ntag is
+ * returned.  In every other case WC_C_CANCEL_TAG is written and 0 is
+ * returned.
+ */
+static int
+wc_push_tag_to_utf8(Str os, int ntag)
+{
+    char *name = ntag ? wc_ucs_get_tag(ntag) : NULL;
+
+    if (name == NULL) {
+        /* no tag requested, or the tag has no string form */
+        wc_ucs_to_utf8(WC_C_CANCEL_TAG, utf8_buf);
+        Strcat_charp(os, (char *)utf8_buf);
+        return 0;
+    }
+
+    wc_ucs_to_utf8(WC_C_LANGUAGE_TAG, utf8_buf);
+    Strcat_charp(os, (char *)utf8_buf);
+    while (*name) {
+        wc_ucs_to_utf8(WC_C_LANGUAGE_TAG0 | *name, utf8_buf);
+        Strcat_charp(os, (char *)utf8_buf);
+        name++;
+    }
+    return ntag;
+}
+
+/* Write the cancel-tag sequence to os if st records an active language tag. */
+static void
+utf8_cancel_active_tag(Str os, wc_status *st)
+{
+    if (st->ntag)
+        st->ntag = wc_push_tag_to_utf8(os, 0);
+}
+
+/* Encode ucs through the shared scratch buffer and append it to os. */
+static void
+utf8_append_ucs(Str os, wc_uint32 ucs)
+{
+    wc_ucs_to_utf8(ucs, utf8_buf);
+    Strcat_charp(os, (char *)utf8_buf);
+}
+
+/*
+ * Append the character cc to os as UTF-8.
+ *
+ *   os  output string
+ *   cc  character (character set + code); modified locally when it has to
+ *       be mapped to UCS first
+ *   st  conversion status; st->ntag tracks the language tag currently in
+ *       effect in the output and is updated here
+ *
+ * Character sets not handled directly are mapped with wc_any_to_ucs() when
+ * WcOption.ucs_conv is set; if that is disabled or fails the character is
+ * treated as unknown (wide or narrow) and, unless WcOption.no_replace is
+ * set, the matching replacement string is written.
+ */
+void
+wc_push_to_utf8(Str os, wc_wchar_t cc, wc_status *st)
+{
+    for (;;) {
+        switch (WC_CCS_SET(cc.ccs)) {
+        case WC_CCS_UCS_TAG:
+            /* tagged character: switch the output tag only when it changes */
+            if (WcOption.use_language_tag &&
+                wc_ucs_tag_to_tag(cc.code) != st->ntag)
+                st->ntag = wc_push_tag_to_utf8(os, wc_ucs_tag_to_tag(cc.code));
+            utf8_append_ucs(os, wc_ucs_tag_to_ucs(cc.code));
+            return;
+
+        case WC_CCS_US_ASCII:
+            utf8_cancel_active_tag(os, st);
+            Strcat_char(os, (char)(cc.code & 0x7f));
+            return;
+
+        case WC_CCS_ISO_8859_1:
+            utf8_cancel_active_tag(os, st);
+            utf8_append_ucs(os, (cc.code | 0x80));
+            return;
+
+        case WC_CCS_UCS2:
+        case WC_CCS_UCS4:
+            utf8_cancel_active_tag(os, st);
+            utf8_append_ucs(os, cc.code);
+            return;
+
+        case WC_CCS_UNKNOWN:
+            if (WcOption.no_replace)
+                return;
+            utf8_cancel_active_tag(os, st);
+            Strcat_charp(os, WC_REPLACE);
+            return;
+
+        case WC_CCS_UNKNOWN_W:
+            if (WcOption.no_replace)
+                return;
+            utf8_cancel_active_tag(os, st);
+            Strcat_charp(os, WC_REPLACE_W);
+            return;
+
+        default:
+            /* Rewrite cc into a set handled above, then go round again. */
+            if (WcOption.ucs_conv) {
+                cc.code = wc_any_to_ucs(cc);
+                if (cc.code != WC_C_UCS4_ERROR) {
+                    cc.ccs = WC_CCS_UCS2;
+                    break;
+                }
+            }
+            if (WC_CCS_IS_WIDE(cc.ccs))
+                cc.ccs = WC_CCS_UNKNOWN_W;
+            else
+                cc.ccs = WC_CCS_UNKNOWN;
+            break;
+        }
+    }
+}
+
+/*
+ * Finish UTF-8 output: if a language tag is still active according to
+ * st->ntag, write the cancel-tag sequence to os and record the result in
+ * st->ntag.  Does nothing when no tag is active.
+ */
+void
+wc_push_to_utf8_end(Str os, wc_status *st)
+{
+    if (!st->ntag)
+        return;
+    st->ntag = wc_push_tag_to_utf8(os, 0);
+}
+
+/*
+ * Incremental UTF-8 decoder fed one byte per call.
+ *
+ *   c   next input byte
+ *   st  conversion status; st->state == -1 asks for a fresh start and is
+ *       set back to -1 whenever a result is returned
+ *
+ * Returns NULL while a multi-byte sequence is still incomplete.  Otherwise
+ * returns the output Str, which holds the decoded character, or nothing if
+ * the input was undecodable, a surrogate, or WC_C_UCS2_BOM.
+ *
+ * The output Str, the byte buffer and the counters are function-level
+ * statics, so they are shared by all callers regardless of which st is
+ * passed in.
+ */
+Str
+wc_char_conv_from_utf8(wc_uchar c, wc_status *st)
+{
+    static Str os;
+    static wc_uchar buf[6];	/* bytes of the sequence collected so far */
+    static size_t nbuf, next;	/* bytes stored / continuation bytes expected */
+    wc_uint32 ucs;
+
+    if (st->state == -1) {
+        st->state = WC_UTF8_NOSTATE;
+        os = Strnew_size(8);
+        st->tag = NULL;
+        st->ntag = 0;
+        nbuf = 0;
+    }
+
+    if (st->state == WC_UTF8_NEXT && WC_UTF8_MAP[c]) {
+        /*
+         * The pending sequence was cut short by a byte that is not a
+         * continuation byte.  Drop the truncated sequence only, and treat
+         * c as fresh input below, so a valid character that follows a
+         * broken sequence is not lost with it.
+         */
+        nbuf = 0;
+        st->state = WC_UTF8_NOSTATE;
+    }
+
+    switch (st->state) {
+    case WC_UTF8_NOSTATE:
+        switch (next = WC_UTF8_MAP[c]) {
+        case 1:
+            wtf_push_ucs(os, (wc_uint32)c, st);
+            break;
+        case 8:
+            Strcat_char(os, (char)c);
+            break;
+        case 0:
+        case 7:
+            /* stray continuation byte or 0xfe/0xff: produces no output */
+            break;
+        default:
+            /* lead byte: start collecting and ask for more input */
+            buf[nbuf++] = c;
+            next--;
+            st->state = WC_UTF8_NEXT;
+            return NULL;
+        }
+        break;
+    case WC_UTF8_NEXT:
+        /* c is a continuation byte here; other bytes were rerouted above */
+        buf[nbuf++] = c;
+        if (--next)
+            return NULL;
+        ucs = wc_utf8_to_ucs(buf);
+        if (ucs == WC_C_UCS4_ERROR ||
+            (ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END))
+            break;
+        if (ucs != WC_C_UCS2_BOM)
+            wtf_push_ucs(os, ucs, st);
+        break;
+    }
+    st->state = -1;
+    return os;
+}
+
+#endif