aboutsummaryrefslogtreecommitdiffstats
path: root/libwc/detect.c
diff options
context:
space:
mode:
authorTatsuya Kinoshita <tats@vega.ocn.ne.jp>2011-05-04 07:18:09 +0000
committerTatsuya Kinoshita <tats@vega.ocn.ne.jp>2011-05-04 07:18:09 +0000
commit5f8e0f8ef9a422691dd72e8a953a42a41478fcb4 (patch)
tree4b2df4796a534793648b3c4fc532fc36bd0cd525 /libwc/detect.c
parentReleasing debian version 0.3-2.4 (diff)
downloadw3m-5f8e0f8ef9a422691dd72e8a953a42a41478fcb4.tar.gz
w3m-5f8e0f8ef9a422691dd72e8a953a42a41478fcb4.zip
Releasing debian version 0.5.1-1debian/0.5.1-1
Diffstat (limited to '')
-rw-r--r--libwc/detect.c544
1 files changed, 544 insertions, 0 deletions
diff --git a/libwc/detect.c b/libwc/detect.c
new file mode 100644
index 0000000..eea2d5d
--- /dev/null
+++ b/libwc/detect.c
@@ -0,0 +1,544 @@
+
+#include "wc.h"
+#include "iso2022.h"
+#include "sjis.h"
+#include "big5.h"
+#include "hz.h"
+#include "viet.h"
+#ifdef USE_UNICODE
+#include "utf8.h"
+#include "utf7.h"
+#endif
+
+/*
+ * Per-byte flag table consulted by wc_auto_detect(): a non-zero entry marks
+ * a byte that ends the initial "plain text" skip and, for the private
+ * (hint-specific) detector, counts as evidence for the hinted encoding.
+ *
+ * Initial contents: 0x00-0x7f clear, 0x80-0xff set.  The table is writable
+ * because wc_create_detect_map() rewrites some of the low entries at run
+ * time.
+ */
+wc_uint8 WC_DETECT_MAP[ 0x100 ] = {
+    /* 0x00-0x0f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x10-0x1f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x20-0x2f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x30-0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x40-0x4f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x50-0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x60-0x6f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x70-0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0x80-0x8f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 0x90-0x9f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 0xa0-0xaf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 0xb0-0xbf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 0xc0-0xcf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 0xd0-0xdf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 0xe0-0xef */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 0xf0-0xff */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+};
+
+/*
+ * Status of one candidate detector.  DETECT_NORMAL and DETECT_ERROR are
+ * used as whole values; POSSIBLE, OK and BROKEN are OR-ed into the status
+ * as flags.
+ */
+enum {
+    DETECT_NORMAL   = 0,	/* candidate active, nothing seen yet */
+    DETECT_POSSIBLE = 1,	/* weak evidence for the candidate */
+    DETECT_OK       = 2,	/* a valid sequence was seen */
+    DETECT_BROKEN   = 4,	/* one invalid sequence was tolerated */
+    DETECT_ERROR    = 8		/* candidate ruled out */
+};
+
+/* OR the flag(s) y into status x. */
+#define SET_DETECT(x,y) ((x) |= (y))
+
+/*
+ * Record an invalid sequence in status x: the first one only adds
+ * DETECT_BROKEN, a second one replaces the status with DETECT_ERROR.
+ * x is evaluated more than once, so pass a plain variable.
+ */
+#define SET_BROKEN_ERROR(x) \
+    ((x) = (((x) & DETECT_BROKEN) != 0 ? DETECT_ERROR : ((x) | DETECT_BROKEN)))
+
+/*
+ * Rebuild the entries of WC_DETECT_MAP that depend on the encoding scheme.
+ *
+ * ces: encoding scheme whose significant low bytes should be flagged.
+ * esc: when true, ESC is flagged even if ces is not an ISO-2022 scheme.
+ *
+ * The ces-dependent entries (0x00-0x1f, the HZ tilde and, with
+ * USE_UNICODE, the UTF-7 plus) are recomputed only when ces differs from
+ * the one seen on the previous call, which is remembered in a
+ * function-local static.  The ESC entry is recomputed on every call since
+ * it also depends on esc.  Entries 0x80-0xff are never modified here.
+ */
+void
+wc_create_detect_map(wc_ces ces, wc_bool esc)
+{
+    static wc_ces detect_ces = WC_CES_US_ASCII;
+    int i;
+
+    if (ces != detect_ces) {
+        /* C0 table of the Vietnamese scheme, or NULL when there is none. */
+        wc_uint8 *map = NULL;
+
+        if (ces & WC_CES_T_VIET) {
+            switch (ces) {
+            case WC_CES_TCVN_5712:
+                map = wc_c0_tcvn57122_map;
+                break;
+            case WC_CES_VISCII_11:
+                map = wc_c0_viscii112_map;
+                break;
+            case WC_CES_VPS:
+                map = wc_c0_vps2_map;
+                break;
+            default:
+                /* No C0 table known: leave map NULL, handled below. */
+                break;
+            }
+        }
+        /* A missing table is treated as "no C0 byte is significant"
+         * instead of being dereferenced. */
+        for (i = 0; i < 0x20; i++)
+            WC_DETECT_MAP[i] = (map != NULL && map[i]) ? 1 : 0;
+        /* Reset these for every scheme so that a flag set by an earlier
+         * HZ / UTF-7 call cannot survive a switch to another scheme. */
+        WC_DETECT_MAP[WC_C_HZ_TILDA] = (ces == WC_CES_HZ_GB_2312) ? 1 : 0;
+#ifdef USE_UNICODE
+        WC_DETECT_MAP[WC_C_UTF7_PLUS] = (ces == WC_CES_UTF_7) ? 1 : 0;
+#endif
+        detect_ces = ces;
+    }
+    WC_DETECT_MAP[WC_C_ESC] = (esc || (ces & WC_CES_T_ISO_2022)) ? 1 : 0;
+}
+
+/*
+ * Guess the character encoding scheme of a byte buffer.
+ *
+ * is:   buffer to examine.
+ * len:  number of bytes in is.
+ * hint: encoding the caller expects; it selects which candidate detectors
+ *       are run and is the result when nothing better is recognised.
+ *
+ * Returns WC_CES_US_ASCII when the buffer contains no byte flagged in
+ * WC_DETECT_MAP; otherwise the scheme picked by the decision code that
+ * follows the scan loop.
+ *
+ * Each candidate keeps a *_detect status built from the DETECT_* values;
+ * `possible` counts the candidates that are not yet DETECT_ERROR and `ok`
+ * records that some detector has seen a valid sequence.  Scanning stops
+ * early once no candidate is left, or a single one is left and `ok` is set.
+ *
+ * One-byte lookahead goes through `next`, which is 0 when the current byte
+ * is the last one of the buffer, so this function itself never reads
+ * is[len].  Callers that pass a NUL-terminated buffer observe the same
+ * results as before.
+ * NOTE(review): wc_parse_iso2022_esc() is given no end pointer; whether it
+ * can read past the buffer on a truncated escape sequence cannot be told
+ * from this file.
+ */
+wc_ces
+wc_auto_detect(char *is, size_t len, wc_ces hint)
+{
+    wc_uchar *p = (wc_uchar *)is;
+    wc_uchar *ep = p + len;
+    wc_uchar *q;
+    wc_uchar next;
+    wc_ces euc = 0, priv = 0;
+    wc_status st;
+    int euc_state = 0, sjis_state = 0, big5_state = 0, hz_state = 0;
+    int iso_detect = DETECT_ERROR, euc_detect = DETECT_ERROR,
+        sjis_detect = DETECT_ERROR, big5_detect = DETECT_ERROR,
+        hz_detect = DETECT_ERROR, latin_detect = DETECT_ERROR,
+        priv_detect = DETECT_ERROR;
+    int possible = 0;
+    wc_bool iso2022jp2 = WC_FALSE, iso2022jp3 = WC_FALSE,
+        iso2022cn = WC_FALSE, iso2022kr = WC_FALSE, ok = WC_FALSE;
+#ifdef USE_UNICODE
+    int utf8_state = 0;
+    int utf8_detect = DETECT_ERROR;
+    int utf8_next = 0;
+#endif
+
+    /* Skip the leading run of bytes that are not flagged for this hint. */
+    wc_create_detect_map(hint, WC_TRUE);
+    for (; p < ep && ! WC_DETECT_MAP[*p]; p++)
+        ;
+    if (p == ep)
+        return WC_CES_US_ASCII;
+
+    /* Enable the candidate detectors that make sense for the hint. */
+    switch (hint) {
+    case WC_CES_ISO_2022_JP:
+    case WC_CES_ISO_2022_JP_2:
+    case WC_CES_ISO_2022_JP_3:
+    case WC_CES_EUC_JP:
+    case WC_CES_SHIFT_JIS:
+    case WC_CES_SHIFT_JISX0213:
+        euc = WC_CES_EUC_JP;
+        euc_state = WC_EUC_NOSTATE;
+        sjis_state = WC_SJIS_NOSTATE;
+        iso_detect = euc_detect = sjis_detect = DETECT_NORMAL;
+        possible = 3;
+        break;
+    case WC_CES_ISO_2022_CN:
+    case WC_CES_EUC_CN:
+        euc = WC_CES_EUC_CN;
+        euc_state = WC_EUC_NOSTATE;
+        big5_state = WC_BIG5_NOSTATE;
+        iso_detect = euc_detect = big5_detect = DETECT_NORMAL;
+        possible = 3;
+        break;
+    case WC_CES_EUC_TW:
+    case WC_CES_BIG5:
+        euc = WC_CES_EUC_TW;
+        euc_state = WC_EUC_NOSTATE;
+        big5_state = WC_BIG5_NOSTATE;
+        iso_detect = euc_detect = big5_detect = DETECT_NORMAL;
+        possible = 3;
+        break;
+    case WC_CES_HZ_GB_2312:
+        euc = WC_CES_EUC_CN;
+        euc_state = WC_EUC_NOSTATE;
+        /* The Big5 detector is enabled below, so give it its start state
+         * explicitly like the other cases that enable it. */
+        big5_state = WC_BIG5_NOSTATE;
+        hz_state = WC_HZ_NOSTATE;
+        iso_detect = euc_detect = big5_detect = hz_detect = DETECT_NORMAL;
+        possible = 4;
+        break;
+    case WC_CES_ISO_2022_KR:
+    case WC_CES_EUC_KR:
+        euc = WC_CES_EUC_KR;
+        euc_state = WC_EUC_NOSTATE;
+        iso_detect = euc_detect = DETECT_NORMAL;
+        /* NOTE(review): only two detectors are enabled here but the count
+         * is 3; the over-count merely postpones the early exit of the
+         * scan loop.  Left as is pending confirmation. */
+        possible = 3;
+        break;
+#ifdef USE_UNICODE
+    case WC_CES_UTF_8:
+        iso_detect = DETECT_NORMAL;
+        possible = 1;
+        break;
+#endif
+    case WC_CES_US_ASCII:
+        iso_detect = latin_detect = DETECT_NORMAL;
+        possible = 2;
+        break;
+    default:
+        if (hint & WC_CES_T_ISO_8859) {
+            iso_detect = latin_detect = DETECT_NORMAL;
+            possible = 2;
+        } else {
+            iso_detect = priv_detect = DETECT_NORMAL;
+            priv = hint;	/* for TCVN, VISCII, VPS */
+            possible = 2;
+        }
+        break;
+    }
+#ifdef USE_UNICODE
+    /* UTF-8 is a candidate whenever the private detector is not in use. */
+    if (priv_detect == DETECT_ERROR) {
+        utf8_detect = DETECT_NORMAL;
+        possible++;
+    }
+#endif
+
+    wc_input_init(WC_CES_US_ASCII, &st);
+
+    for (; p < ep; p++) {
+        if (possible == 0 || (possible == 1 && ok))
+            break;
+        /* Bounded lookahead: 0 stands in for "no following byte". */
+        next = (p + 1 < ep) ? *(p + 1) : 0;
+
+        /* ISO-2022: escape sequences and SI/SO; any 8-bit byte rules it out. */
+        if (iso_detect != DETECT_ERROR) {
+            switch (*p) {
+            case WC_C_ESC:
+                if (next == WC_C_MBCS) {
+                    q = p;
+                    /* Unparsable escape: leave the ISO status untouched. */
+                    if (! wc_parse_iso2022_esc(&q, &st))
+                        break;
+                    if (st.design[0] == WC_CCS_JIS_C_6226 ||
+                        st.design[0] == WC_CCS_JIS_X_0208)
+                        ;
+                    else if (st.design[0] == WC_CCS_JIS_X_0213_1 ||
+                             st.design[0] == WC_CCS_JIS_X_0213_2)
+                        iso2022jp3 = WC_TRUE;
+                    else if (WC_CCS_TYPE(st.design[0]) == WC_CCS_A_CS94W)
+                        iso2022jp2 = WC_TRUE;
+                    if (st.design[1] == WC_CCS_KS_X_1001)
+                        iso2022kr = WC_TRUE;
+                    else if (st.design[1] == WC_CCS_GB_2312 ||
+                             st.design[1] == WC_CCS_ISO_IR_165 ||
+                             st.design[1] == WC_CCS_CNS_11643_1)
+                        iso2022cn = WC_TRUE;
+                    if (WC_CCS_TYPE(st.design[2]) == WC_CCS_A_CS94W ||
+                        WC_CCS_TYPE(st.design[3]) == WC_CCS_A_CS94W)
+                        iso2022cn = WC_TRUE;
+                } else if (next == WC_C_G2_CS96) {
+                    q = p;
+                    if (! wc_parse_iso2022_esc(&q, &st))
+                        break;
+                    if (WC_CCS_TYPE(st.design[2]) == WC_CCS_A_CS96)
+                        iso2022jp2 = WC_TRUE;
+                } else if (next == WC_C_CSWSR) {
+                    q = p;
+                    if (! wc_parse_iso2022_esc(&q, &st))
+                        break;
+                    /* Forces the scan to end on the next iteration. */
+                    possible = 0;
+                    iso_detect = DETECT_BROKEN;
+                    continue;
+                }
+                iso_detect = DETECT_OK;
+                ok = WC_TRUE;
+                break;
+            case WC_C_SI:
+            case WC_C_SO:
+                iso_detect = DETECT_OK;
+                ok = WC_TRUE;
+                iso2022cn = WC_TRUE;
+                iso2022kr = WC_TRUE;
+                break;
+            default:
+                if (*p & 0x80) {
+                    iso_detect = DETECT_ERROR;
+                    possible--;
+                }
+                break;
+            }
+        }
+
+        /* EUC (JP / CN / TW / KR, selected through `euc`). */
+        if (euc_detect != DETECT_ERROR) {
+            switch (euc_state) {
+            case WC_EUC_NOSTATE:
+                switch (WC_ISO_MAP[*p]) {
+                case WC_ISO_MAP_GR:
+                    euc_state = WC_EUC_MBYTE1;
+                    break;
+                case WC_ISO_MAP_SS2:
+                    if (euc == WC_CES_EUC_JP)
+                        euc_state = WC_EUC_MBYTE1;
+                    else if (euc == WC_CES_EUC_TW)
+                        euc_state = WC_EUC_TW_SS2;
+                    else
+                        euc_detect = DETECT_ERROR;
+                    break;
+                case WC_ISO_MAP_SS3:
+                    if (euc == WC_CES_EUC_JP &&
+                        WC_ISO_MAP[next] == WC_ISO_MAP_GR)
+                        ;
+                    else
+                        euc_detect = DETECT_ERROR;
+                    break;
+                case WC_ISO_MAP_C1:
+                case WC_ISO_MAP_GR96:
+                    euc_detect = DETECT_ERROR;
+                    break;
+                }
+                break;
+            case WC_EUC_MBYTE1:
+                if (WC_ISO_MAP[*p] == WC_ISO_MAP_GR) {
+                    SET_DETECT(euc_detect, DETECT_OK);
+                    ok = WC_TRUE;
+                } else
+                    SET_BROKEN_ERROR(euc_detect);
+                euc_state = WC_EUC_NOSTATE;
+                break;
+            case WC_EUC_TW_SS2:
+                if (!( 0xa0 <= *p && *p <= 0xb0) ||
+                    WC_ISO_MAP[next] != WC_ISO_MAP_GR)
+                    euc_detect = DETECT_ERROR;
+                euc_state = WC_EUC_NOSTATE;
+                break;
+            }
+            if (euc_detect == DETECT_ERROR)
+                possible--;
+        }
+
+        /* Shift_JIS. */
+        if (sjis_detect != DETECT_ERROR) {
+            switch (sjis_state) {
+            case WC_SJIS_NOSTATE:
+                switch (WC_SJIS_MAP[*p]) {
+                case WC_SJIS_MAP_SL:
+                case WC_SJIS_MAP_SH:
+                    sjis_state = WC_SJIS_SHIFT_L;
+                    break;
+                case WC_SJIS_MAP_SK:
+                    SET_DETECT(sjis_detect, DETECT_POSSIBLE);
+                    break;
+                case WC_SJIS_MAP_SX:
+                    if (WcOption.use_jisx0213) {
+                        sjis_state = WC_SJIS_SHIFT_X;
+                        break;
+                    }
+                    /* FALLTHROUGH: without JIS X 0213 this lead byte is
+                     * an error */
+                case WC_SJIS_MAP_80:
+                case WC_SJIS_MAP_A0:
+                case WC_SJIS_MAP_C1:
+                    sjis_detect = DETECT_ERROR;
+                    break;
+                }
+                break;
+            case WC_SJIS_SHIFT_L:
+                if (WC_SJIS_MAP[*p] & WC_SJIS_MAP_LB) {
+                    SET_DETECT(sjis_detect, DETECT_OK);
+                    ok = WC_TRUE;
+                } else
+                    SET_BROKEN_ERROR(sjis_detect);
+                sjis_state = WC_SJIS_NOSTATE;
+                break;
+            case WC_SJIS_SHIFT_X:
+                if (WC_SJIS_MAP[*p] & WC_SJIS_MAP_LB)
+                    SET_DETECT(sjis_detect, DETECT_POSSIBLE);
+                else
+                    sjis_detect = DETECT_ERROR;
+                sjis_state = WC_SJIS_NOSTATE;
+                break;
+            }
+            if (sjis_detect == DETECT_ERROR)
+                possible--;
+        }
+
+        /* Big5. */
+        if (big5_detect != DETECT_ERROR) {
+            switch (big5_state) {
+            case WC_BIG5_NOSTATE:
+                switch (WC_BIG5_MAP[*p]) {
+                case WC_BIG5_MAP_UB:
+                    big5_state = WC_BIG5_MBYTE1;
+                    break;
+                case WC_BIG5_MAP_C1:
+                    big5_detect = DETECT_ERROR;
+                    break;
+                }
+                break;
+            case WC_BIG5_MBYTE1:
+                if (WC_BIG5_MAP[*p] & WC_BIG5_MAP_LB) {
+                    SET_DETECT(big5_detect, DETECT_OK);
+                    ok = WC_TRUE;
+                } else
+                    SET_BROKEN_ERROR(big5_detect);
+                big5_state = WC_BIG5_NOSTATE;
+                break;
+            }
+            if (big5_detect == DETECT_ERROR)
+                possible--;
+        }
+
+        /* HZ: 7-bit only; tilde-introduced shifts are tracked in hz_state. */
+        if (hz_detect != DETECT_ERROR) {
+            if (*p & 0x80) {
+                hz_detect = DETECT_ERROR;
+                possible--;
+            } else {
+                switch (hz_state) {
+                case WC_HZ_NOSTATE:
+                    if (*p == WC_C_HZ_TILDA)
+                        hz_state = WC_HZ_TILDA;
+                    break;
+                case WC_HZ_TILDA:
+                    if (*p == WC_C_HZ_SI)
+                        hz_state = WC_HZ_MBYTE;
+                    else
+                        hz_state = WC_HZ_NOSTATE;
+                    break;
+                case WC_HZ_TILDA_MB:
+                    if (*p == WC_C_HZ_SO)
+                        hz_state = WC_HZ_NOSTATE;
+                    else
+                        hz_state = WC_HZ_MBYTE;
+                    break;
+                case WC_HZ_MBYTE:
+                    if (*p == WC_C_HZ_TILDA)
+                        hz_state = WC_HZ_TILDA_MB;
+                    else
+                        hz_state = WC_HZ_MBYTE1;
+                    break;
+                case WC_HZ_MBYTE1:
+                    hz_detect = DETECT_OK;
+                    ok = WC_TRUE;
+                    hz_state = WC_HZ_NOSTATE;
+                    break;
+                }
+            }
+        }
+
+        /* ISO-8859 style single-byte: GR bytes confirm, C1 bytes rule out. */
+        if (latin_detect != DETECT_ERROR) {
+            switch (WC_ISO_MAP[*p] & WC_ISO_MAP_CG) {
+            case WC_ISO_MAP_GR:
+            case WC_ISO_MAP_GR96:
+                SET_DETECT(latin_detect, DETECT_OK);
+                ok = WC_TRUE;
+                break;
+            case WC_ISO_MAP_C1:
+                latin_detect = DETECT_ERROR;
+                break;
+            }
+            if (latin_detect == DETECT_ERROR)
+                possible--;
+        }
+
+        /* Hint-specific scheme: any flagged byte other than ESC confirms
+         * it.  This detector never fails, so it never lowers `possible`. */
+        if (priv_detect != DETECT_ERROR) {
+            if (*p != WC_C_ESC && WC_DETECT_MAP[*p]) {
+                SET_DETECT(priv_detect, DETECT_OK);
+                ok = WC_TRUE;
+            }
+        }
+#ifdef USE_UNICODE
+        /* UTF-8: utf8_next counts the continuation bytes still expected. */
+        if (utf8_detect != DETECT_ERROR) {
+            switch (utf8_state) {
+            case WC_UTF8_NOSTATE:
+                switch (utf8_next = WC_UTF8_MAP[*p]) {
+                case 1:
+                case 8:
+                    break;
+                case 0:
+                case 7:
+                    utf8_detect = DETECT_ERROR;
+                    break;
+                default:
+                    utf8_next--;
+                    utf8_state = WC_UTF8_NEXT;
+                    break;
+                }
+                break;
+            case WC_UTF8_NEXT:
+                /* Inside a sequence only bytes mapped to 0 are accepted. */
+                if (WC_UTF8_MAP[*p]) {
+                    utf8_detect = DETECT_ERROR;
+                    utf8_state = WC_UTF8_NOSTATE;
+                    break;
+                }
+                utf8_next--;
+                if (! utf8_next) {
+                    SET_DETECT(utf8_detect, DETECT_OK);
+                    ok = WC_TRUE;
+                    utf8_state = WC_UTF8_NOSTATE;
+                }
+                break;
+            }
+            if (utf8_detect == DETECT_ERROR)
+                possible--;
+        }
+#endif
+    }
+
+    /* No 8-bit byte was seen while the ISO-2022 detector was active. */
+    if (iso_detect != DETECT_ERROR) {
+        if (iso_detect == DETECT_NORMAL) {
+            /* No escape or shift either: 7-bit text. */
+            if (hz_detect == DETECT_OK)
+                return WC_CES_HZ_GB_2312;
+            if (priv_detect == DETECT_OK)
+                return priv;
+            return WC_CES_US_ASCII;
+        }
+        /* Prefer the ISO-2022 variant that matches the hinted family. */
+        switch (euc) {
+        case WC_CES_EUC_CN:
+        case WC_CES_EUC_TW:
+            if (iso2022cn)
+                return WC_CES_ISO_2022_CN;
+            break;
+        case WC_CES_EUC_KR:
+            if (iso2022kr)
+                return WC_CES_ISO_2022_KR;
+            break;
+        }
+        if (iso2022jp3)
+            return WC_CES_ISO_2022_JP_3;
+        if (iso2022jp2)
+            return WC_CES_ISO_2022_JP_2;
+        if (iso2022cn)
+            return WC_CES_ISO_2022_CN;
+        if (iso2022kr)
+            return WC_CES_ISO_2022_KR;
+        return WC_CES_ISO_2022_JP;
+    }
+
+    /* 8-bit data: keep the hint if its own detector did not fail. */
+    switch (hint) {
+    case WC_CES_ISO_2022_JP:
+    case WC_CES_ISO_2022_JP_2:
+    case WC_CES_ISO_2022_JP_3:
+    case WC_CES_ISO_2022_KR:
+    case WC_CES_ISO_2022_CN:
+        break;
+    case WC_CES_EUC_JP:
+    case WC_CES_EUC_CN:
+    case WC_CES_EUC_TW:
+    case WC_CES_EUC_KR:
+        if (euc_detect != DETECT_ERROR)
+            return hint;
+        break;
+    case WC_CES_SHIFT_JIS:
+    case WC_CES_SHIFT_JISX0213:
+        if (sjis_detect != DETECT_ERROR)
+            return hint;
+        break;
+    case WC_CES_BIG5:
+        if (big5_detect != DETECT_ERROR)
+            return hint;
+        break;
+#ifdef USE_UNICODE
+    case WC_CES_UTF_8:
+        return hint;
+#endif
+    case WC_CES_US_ASCII:
+#ifdef USE_UNICODE
+        if (utf8_detect != DETECT_ERROR)
+            return hint;
+#endif
+        if (latin_detect != DETECT_ERROR)
+            return WC_CES_ISO_8859_1;
+        return hint;
+    default:
+        if (latin_detect != DETECT_ERROR)
+            return hint;
+        if (priv_detect != DETECT_ERROR)
+            return hint;
+#ifdef USE_UNICODE
+        if (utf8_detect != DETECT_ERROR)
+            return WC_CES_UTF_8;
+#endif
+        return hint;
+    }
+
+    /* The hinted scheme failed: first take a candidate that saw a valid
+     * sequence, then any candidate that was merely not ruled out. */
+    if (euc_detect == DETECT_OK)
+        return euc;
+    if (sjis_detect == DETECT_OK)
+        return WC_CES_SHIFT_JIS;
+    if (big5_detect == DETECT_OK)
+        return WC_CES_BIG5;
+#ifdef USE_UNICODE
+    if (utf8_detect == DETECT_OK)
+        return WC_CES_UTF_8;
+    if (sjis_detect & DETECT_POSSIBLE)
+        return WC_CES_SHIFT_JIS;
+#endif
+    if (euc_detect != DETECT_ERROR)
+        return euc;
+    if (sjis_detect != DETECT_ERROR)
+        return WC_CES_SHIFT_JIS;
+    if (big5_detect != DETECT_ERROR)
+        return WC_CES_BIG5;
+#ifdef USE_UNICODE
+    if (utf8_detect != DETECT_ERROR)
+        return WC_CES_UTF_8;
+#endif
+    return hint;
+}