aboutsummaryrefslogtreecommitdiffstats
path: root/libwc/charset.c
diff options
context:
space:
mode:
author Tatsuya Kinoshita <tats@vega.ocn.ne.jp> 2011-05-04 07:05:14 +0000
committer Tatsuya Kinoshita <tats@vega.ocn.ne.jp> 2011-05-04 07:05:14 +0000
commit 72f72d64a422d6628c4796f5c0bf2e508f134214 (patch)
tree 0c9ea90cc53310832c977265521fb44db24a515e /libwc/charset.c
parent Adding upstream version 0.3 (diff)
downloadw3m-upstream/0.5.1.tar.gz
w3m-upstream/0.5.1.zip
Adding upstream version 0.5.1upstream/0.5.1
Diffstat (limited to 'libwc/charset.c')
-rw-r--r--libwc/charset.c492
1 files changed, 492 insertions, 0 deletions
diff --git a/libwc/charset.c b/libwc/charset.c
new file mode 100644
index 0000000..95343b3
--- /dev/null
+++ b/libwc/charset.c
@@ -0,0 +1,492 @@
+
+#include <stdlib.h>
+#include <ctype.h>
+#include <gc.h>
+#define New_N(type,n) ((type*)GC_MALLOC((n)*sizeof(type)))
+
+#include "wc.h"
+
+wc_locale WcLocale = 0;	/* Locale hint used to resolve a bare "euc"
+				 * charset name in wc_charset_to_ces().
+				 * wc_locale_to_ces() assigns it when the
+				 * locale's codeset suffix is "euc", and
+				 * resets it to 0 for languages it does not
+				 * recognise. */
+
+/*
+ * Fallback table mapping a two-letter language code to a default CES.
+ * wc_locale_to_ces() consults it, comparing the first two characters of
+ * the lower-cased locale name, when the locale has no ".codeset" suffix.
+ * The table is read-only and is terminated by an entry whose lang is NULL.
+ */
+static const struct {
+    const char *lang;
+    wc_ces ces;
+} lang_ces_table[] = {
+    { "cs", WC_CES_ISO_8859_2 },	/* cs_CZ */
+    { "el", WC_CES_ISO_8859_7 },	/* el_GR */
+    { "iw", WC_CES_ISO_8859_8 },	/* iw_IL */
+    { "ja", WC_CES_EUC_JP },		/* ja_JP */
+    { "ko", WC_CES_EUC_KR },		/* ko_KR */
+    { "hu", WC_CES_ISO_8859_2 },	/* hu_HU */
+    { "pl", WC_CES_ISO_8859_2 },	/* pl_PL */
+    { "ro", WC_CES_ISO_8859_2 },	/* ro_RO */
+    { "ru", WC_CES_ISO_8859_5 },	/* ru_SU */
+    { "sk", WC_CES_ISO_8859_2 },	/* sk_SK */
+    { "sl", WC_CES_ISO_8859_2 },	/* sl_CS */
+    { "tr", WC_CES_ISO_8859_9 },	/* tr_TR */
+    { "zh", WC_CES_EUC_CN },		/* zh_CN */
+    { NULL, 0 }
+};
+
+/*
+ * Resolve a charset name with wc_charset_to_ces().
+ * Returns `orig` when the name is NULL, empty, or not recognised
+ * (i.e. the lookup yields 0).
+ */
+wc_ces
+wc_guess_charset(char *charset, wc_ces orig)
+{
+    wc_ces found;
+
+    if (charset == NULL || *charset == '\0')
+        return orig;
+
+    found = wc_charset_to_ces(charset);
+    if (!found)
+        return orig;
+    return found;
+}
+
+/*
+ * Resolve a full or abbreviated charset name with
+ * wc_charset_short_to_ces().
+ * Returns `orig` when the name is NULL, empty, or the lookup yields 0.
+ */
+wc_ces
+wc_guess_charset_short(char *charset, wc_ces orig)
+{
+    wc_ces found;
+
+    if (charset == NULL || *charset == '\0')
+        return orig;
+
+    found = wc_charset_short_to_ces(charset);
+    if (!found)
+        return orig;
+    return found;
+}
+
+/*
+ * Derive a CES from a locale name with wc_locale_to_ces().
+ * Returns `orig` when the locale is NULL, empty, or the lookup yields 0.
+ */
+wc_ces
+wc_guess_locale_charset(char *locale, wc_ces orig)
+{
+    wc_ces found;
+
+    if (locale == NULL || *locale == '\0')
+        return orig;
+
+    found = wc_locale_to_ces(locale);
+    if (!found)
+        return orig;
+    return found;
+}
+
+/*
+ * Map a charset name (e.g. "EUC-JP", "iso-8859-2", "x-sjis", "cp1252",
+ * "Windows-31J") to its CES identifier.
+ *
+ * The name is normalised before matching: a leading "x-"/"X-" is dropped,
+ * bytes <= 0x20 as well as '_' and '-' are removed, the remainder is
+ * lower-cased, and at most 15 characters are kept.  Known names are matched
+ * as prefixes of the normalised string, so trailing text is ignored.
+ *
+ * A bare "euc" (no country letter following) is resolved through the
+ * global WcLocale and defaults to EUC-JP.
+ *
+ * Returns 0 when `charset` is NULL or the name is not recognised.
+ */
+wc_ces
+wc_charset_to_ces(char *charset)
+{
+    char *p = charset;
+    char buf[16];
+    long num;
+    int n;
+
+    if (p == NULL)
+        return 0;
+
+    /* <ctype.h> functions require a value representable as unsigned char. */
+    if (tolower((unsigned char)*p) == 'x' && *(p+1) == '-')
+        p += 2;
+    for (n = 0; *p && n < 15; p++) {
+        if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
+            buf[n++] = tolower((unsigned char)*p);
+    }
+    buf[n] = 0;
+    p = buf;
+    switch (*p) {
+    case 'e':
+        if (! strncmp(p, "euc", 3)) {
+            p += 3;
+            switch (*p) {
+            case 'j': return WC_CES_EUC_JP;
+            case 'c': return WC_CES_EUC_CN;
+            case 't': return WC_CES_EUC_TW;
+            case 'k': return WC_CES_EUC_KR;
+            }
+            /* No country letter: fall back on the locale hint. */
+            switch (WcLocale) {
+            case WC_LOCALE_JA_JP: return WC_CES_EUC_JP;
+            case WC_LOCALE_ZH_CN: return WC_CES_EUC_CN;
+            case WC_LOCALE_ZH_TW: return WC_CES_EUC_TW;
+            case WC_LOCALE_ZH_HK: return WC_CES_EUC_CN;
+            case WC_LOCALE_KO_KR: return WC_CES_EUC_KR;
+            }
+            return WC_CES_EUC_JP;
+        }
+        break;
+    case 'i':
+        if (! strncmp(p, "iso2022", 7)) {
+            p += 7;
+            switch (*p) {
+            case 'j':
+                if (! strncmp(p, "jp2", 3))
+                    return WC_CES_ISO_2022_JP_2;
+                if (! strncmp(p, "jp3", 3))
+                    return WC_CES_ISO_2022_JP_3;
+                return WC_CES_ISO_2022_JP;
+            case 'c': return WC_CES_ISO_2022_CN;
+            case 'k': return WC_CES_ISO_2022_KR;
+            }
+            return WC_CES_ISO_2022_JP;
+        } else if (! strncmp(p, "iso8859", 7)) {
+            /* Parts 1..16 except 12 are accepted; anything else is Latin-1. */
+            num = strtol(p + 7, NULL, 10);
+            if (num >= 1 && num <= 16 && num != 12)
+                return (WC_CES_E_ISO_8859 | (int)num);
+            return WC_CES_ISO_8859_1;
+        }
+        break;
+    case 'j':
+        if (! strncmp(p, "johab", 5))
+            return WC_CES_JOHAB;
+        if (! strncmp(p, "jis", 3))
+            return WC_CES_ISO_2022_JP;
+        break;
+    case 's':
+        /* The longer x0213 names must be tested before their prefixes. */
+        if (! strncmp(p, "shiftjisx0213", 13) ||
+            ! strncmp(p, "sjisx0213", 9))
+            return WC_CES_SHIFT_JISX0213;
+        if (! strncmp(p, "shiftjis", 8) ||
+            ! strncmp(p, "sjis", 4))
+            return WC_CES_SHIFT_JIS;
+        break;
+    case 'g':
+        if (! strncmp(p, "gb18030", 7) ||
+            ! strncmp(p, "gbk2k", 5))
+            return WC_CES_GB18030;
+        if (! strncmp(p, "gbk", 3))
+            return WC_CES_GBK;
+        if (! strncmp(p, "gb2312", 6))
+            return WC_CES_EUC_CN;
+        break;
+    case 'b':
+        if (! strncmp(p, "big5hkscs", 9))
+            return WC_CES_HKSCS;
+        if (! strncmp(p, "big5", 4))
+            return WC_CES_BIG5;
+        break;
+    case 'h':
+        if (! strncmp(p, "hz", 2))
+            return WC_CES_HZ_GB_2312;
+        if (! strncmp(p, "hkscs", 5))
+            return WC_CES_HKSCS;
+        break;
+    case 'k':
+        if (! strncmp(p, "koi8r", 5))
+            return WC_CES_KOI8_R;
+        if (! strncmp(p, "koi8u", 5))
+            return WC_CES_KOI8_U;
+        if (! strncmp(p, "ksx1001", 7))
+            return WC_CES_EUC_KR;
+        if (! strncmp(p, "ksc5601", 7))
+            return WC_CES_EUC_KR;
+        break;
+    case 't':
+        if (! strncmp(p, "tis620", 6))
+            return WC_CES_TIS_620;
+        if (! strncmp(p, "tcvn", 4))
+            return WC_CES_TCVN_5712;
+        break;
+    case 'n':
+        if (! strncmp(p, "next", 4))
+            return WC_CES_NEXTSTEP;
+        break;
+    case 'v':
+        if (! strncmp(p, "viet", 4)) {
+            p += 4;
+            if (! strncmp(p, "tcvn", 4))
+                return WC_CES_TCVN_5712;
+        }
+        /* p may have been advanced past "viet" at this point. */
+        if (! strncmp(p, "viscii", 6))
+            return WC_CES_VISCII_11;
+        if (! strncmp(p, "vps", 3))
+            return WC_CES_VPS;
+        break;
+    case 'u':
+#ifdef USE_UNICODE
+        if (! strncmp(p, "utf8", 4))
+            return WC_CES_UTF_8;
+        if (! strncmp(p, "utf7", 4))
+            return WC_CES_UTF_7;
+#endif
+        if (! strncmp(p, "uhc", 3))
+            return WC_CES_UHC;
+        if (! strncmp(p, "ujis", 4))
+            return WC_CES_EUC_JP;
+        if (! strncmp(p, "usascii", 7))
+            return WC_CES_US_ASCII;
+        break;
+    case 'a':
+        if (! strncmp(p, "ascii", 5))
+            return WC_CES_US_ASCII;
+        break;
+    case 'c':
+        if (! strncmp(p, "cngb", 4))
+            return WC_CES_EUC_CN;
+        if (*(p+1) != 'p')
+            break;
+        num = strtol(p + 2, NULL, 10);
+        switch (num) {
+        case 437: return WC_CES_CP437;
+        case 737: return WC_CES_CP737;
+        case 775: return WC_CES_CP775;
+        case 850: return WC_CES_CP850;
+        case 852: return WC_CES_CP852;
+        case 855: return WC_CES_CP855;
+        case 856: return WC_CES_CP856;
+        case 857: return WC_CES_CP857;
+        case 860: return WC_CES_CP860;
+        case 861: return WC_CES_CP861;
+        case 862: return WC_CES_CP862;
+        case 863: return WC_CES_CP863;
+        case 864: return WC_CES_CP864;
+        case 865: return WC_CES_CP865;
+        case 866: return WC_CES_CP866;
+        case 869: return WC_CES_CP869;
+        case 874: return WC_CES_CP874;
+        case 932: return WC_CES_CP932;	/* CP932 = Shift_JIS */
+        case 936: return WC_CES_CP936;	/* CP936 = GBK > EUC_CN */
+        case 949: return WC_CES_CP949;	/* CP949 = UHC > EUC_KR */
+        case 950: return WC_CES_CP950;	/* CP950 = Big5 */
+        case 1006: return WC_CES_CP1006;
+        case 1250: return WC_CES_CP1250;
+        case 1251: return WC_CES_CP1251;
+        case 1252: return WC_CES_CP1252;
+        case 1253: return WC_CES_CP1253;
+        case 1254: return WC_CES_CP1254;
+        case 1255: return WC_CES_CP1255;
+        case 1256: return WC_CES_CP1256;
+        case 1257: return WC_CES_CP1257;
+        case 1258: return WC_CES_CP1258;
+        }
+        break;
+    case 'w':
+        if (strncmp(p, "windows", 7))
+            break;
+        /*
+         * Step past "windows" before looking at the suffix; testing the
+         * un-advanced pointer against "31j" could never match.
+         */
+        p += 7;
+        if (! strncmp(p, "31j", 3))
+            return WC_CES_CP932;
+        num = strtol(p, NULL, 10);
+        switch (num) {
+        case 1250: return WC_CES_CP1250;
+        case 1251: return WC_CES_CP1251;
+        case 1252: return WC_CES_CP1252;
+        case 1253: return WC_CES_CP1253;
+        case 1254: return WC_CES_CP1254;
+        case 1255: return WC_CES_CP1255;
+        case 1256: return WC_CES_CP1256;
+        case 1257: return WC_CES_CP1257;
+        case 1258: return WC_CES_CP1258;
+        }
+        break;
+    }
+    return 0;
+}
+
+/*
+ * Map a full or abbreviated charset name to its CES identifier.
+ *
+ * The name is first tried as a full charset name via wc_charset_to_ces().
+ * If that yields 0, the name is normalised (bytes <= 0x20, '_' and '-'
+ * removed, lower-cased, at most 15 characters kept) and interpreted as an
+ * abbreviation selected mostly by its first one or two letters, e.g.
+ * "e" / "ej" -> EUC-JP, "j2" -> ISO-2022-JP-2, "l2" -> ISO-8859-2,
+ * "w1252" -> CP1252, "r" -> raw.
+ *
+ * Returns 0 when `charset` is NULL or no abbreviation matches.
+ */
+wc_ces
+wc_charset_short_to_ces(char *charset)
+{
+    char *p = charset;
+    char buf[16];
+    wc_ces ces;
+    long num;
+    int n;
+
+    if (p == NULL)
+        return 0;
+
+    ces = wc_charset_to_ces(charset);
+    if (ces)
+        return ces;
+
+    for (n = 0; *p && n < 15; p++) {
+        /* <ctype.h> functions require a value representable as unsigned char. */
+        if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
+            buf[n++] = tolower((unsigned char)*p);
+    }
+    buf[n] = 0;
+    p = buf;
+    switch (*p) {
+    case 'e':
+        switch (*(p+1)) {
+        case 'j': return WC_CES_EUC_JP;
+        case 'c': return WC_CES_EUC_CN;
+        case 't': return WC_CES_EUC_TW;
+        case 'k': return WC_CES_EUC_KR;
+        }
+        return WC_CES_EUC_JP;
+    case 'j':
+        p++;
+        if (*p == 'o')
+            return WC_CES_JOHAB;
+        /* "j2"/"j3" and "jp2"/"jp3" are both accepted. */
+        if (*p == 'p')
+            p++;
+        if (*p == '2')
+            return WC_CES_ISO_2022_JP_2;
+        if (*p == '3')
+            return WC_CES_ISO_2022_JP_3;
+        return WC_CES_ISO_2022_JP;
+    case 's':
+        return WC_CES_SHIFT_JIS;
+    case 'g':
+        return WC_CES_EUC_CN;
+    case 'b':
+        return WC_CES_BIG5;
+    case 'h':
+        if (*(p+1) == 'k')
+            return WC_CES_HKSCS;
+        return WC_CES_HZ_GB_2312;
+    case 'k':
+        if (*(p+1) == 'o')
+            return WC_CES_KOI8_R;
+        return WC_CES_ISO_2022_KR;
+    case 'l':
+        /* "l<N>" selects ISO-8859-N; invalid N falls back to Latin-1. */
+        num = strtol(p + 1, NULL, 10);
+        if (num >= 1 && num <= 16 && num != 12)
+            return (WC_CES_E_ISO_8859 | (int)num);
+        return WC_CES_ISO_8859_1;
+    case 't':
+        if (*(p+1) == 'c')
+            return WC_CES_TCVN_5712;
+        return WC_CES_TIS_620;
+    case 'n':
+        return WC_CES_NEXTSTEP;
+    case 'v':
+        if (*(p+1) == 'p')
+            return WC_CES_VPS;
+        return WC_CES_VISCII_11;
+#ifdef USE_UNICODE
+    case 'u':
+        if (*(p+1) == '7')
+            return WC_CES_UTF_7;
+        return WC_CES_UTF_8;
+#endif
+    case 'a':
+        return WC_CES_US_ASCII;
+    case 'c':
+        return WC_CES_ISO_2022_CN;
+    case 'w':
+        num = strtol(p + 1, NULL, 10);
+        switch (num) {
+        case 1250: return WC_CES_CP1250;
+        case 1251: return WC_CES_CP1251;
+        case 1252: return WC_CES_CP1252;
+        case 1253: return WC_CES_CP1253;
+        case 1254: return WC_CES_CP1254;
+        case 1255: return WC_CES_CP1255;
+        case 1256: return WC_CES_CP1256;
+        case 1257: return WC_CES_CP1257;
+        case 1258: return WC_CES_CP1258;
+        }
+        break;
+    case 'r':
+        return WC_CES_RAW;
+    }
+    return 0;
+}
+
+/*
+ * Derive a CES from a locale name such as "ja_JP.eucJP", "zh_TW" or "C".
+ *
+ * - "C" maps to US-ASCII.
+ * - If the name has a ".codeset" suffix, the codeset is resolved with
+ *   wc_charset_to_ces() (which may return 0).  When the codeset is exactly
+ *   "euc" (case-insensitive), the global WcLocale is first set from the
+ *   language part so that the bare "euc" can be disambiguated.
+ * - Otherwise the language part decides: "japanese" -> Shift_JIS,
+ *   "zh_tw"/"zh_hk" -> Big5, then the two-letter lang_ces_table, and
+ *   finally ISO-8859-1.
+ *
+ * The language part is lower-cased with bytes <= 0x20 dropped; up to 8
+ * characters are kept so that "japanese" can be recognised (a shorter
+ * buffer made that comparison unreachable).  A NULL locale is treated like
+ * an empty one and yields ISO-8859-1.
+ */
+wc_ces
+wc_locale_to_ces(char *locale)
+{
+    char *p = locale;
+    char buf[9];		/* "japanese" plus terminator */
+    int n;
+
+    if (p == NULL)
+        return WC_CES_ISO_8859_1;
+    if (*p == 'C' && *(p+1) == '\0')
+        return WC_CES_US_ASCII;
+    for (n = 0; *p && *p != '.' && n < 8; p++) {
+        /* <ctype.h> functions require a value representable as unsigned char. */
+        if ((unsigned char)*p > 0x20)
+            buf[n++] = tolower((unsigned char)*p);
+    }
+    buf[n] = 0;
+    if (*p == '.') {
+        p++;
+        if (! strcasecmp(p, "euc")) {
+            switch (buf[0]) {
+            case 'j':
+                WcLocale = WC_LOCALE_JA_JP;
+                break;
+            case 'k':
+                WcLocale = WC_LOCALE_KO_KR;
+                break;
+            case 'z':
+                /* Only the first five characters identify the territory. */
+                if (!strncmp(buf, "zh_tw", 5))
+                    WcLocale = WC_LOCALE_ZH_TW;
+                else if (!strncmp(buf, "zh_hk", 5))
+                    WcLocale = WC_LOCALE_ZH_HK;
+                else
+                    WcLocale = WC_LOCALE_ZH_CN;
+                break;
+            default:
+                WcLocale = 0;
+                break;
+            }
+        }
+        return wc_charset_to_ces(p);
+    }
+
+    if (!strcmp(buf, "japanese"))
+        return WC_CES_SHIFT_JIS;
+    /*
+     * Compare five characters only, so that names carrying extra text after
+     * the territory (e.g. a "@modifier") still select Big5.
+     */
+    if (!strncmp(buf, "zh_tw", 5) ||
+        !strncmp(buf, "zh_hk", 5))
+        return WC_CES_BIG5;
+    for (n = 0; lang_ces_table[n].lang; n++) {
+        if (!strncmp(buf, lang_ces_table[n].lang, 2))
+            return lang_ces_table[n].ces;
+    }
+    return WC_CES_ISO_8859_1;
+}
+
+/*
+ * Return the charset name for a CES: "WTF" for WC_CES_WTF, otherwise the
+ * `name` field of the matching WcCesInfo entry.
+ */
+char *
+wc_ces_to_charset(wc_ces ces)
+{
+    return (ces == WC_CES_WTF) ? "WTF" : WcCesInfo[WC_CES_INDEX(ces)].name;
+}
+
+/*
+ * Return the human-readable description for a CES: a fixed string for
+ * WC_CES_WTF, otherwise the `desc` field of the matching WcCesInfo entry.
+ */
+char *
+wc_ces_to_charset_desc(wc_ces ces)
+{
+    return (ces == WC_CES_WTF)
+        ? "W3M Transfer Format"
+        : WcCesInfo[WC_CES_INDEX(ces)].desc;
+}
+
+/*
+ * Pick the 8-bit counterpart for the ISO-2022 variants, HZ and US-ASCII:
+ * ISO-2022-JP/-JP-2/-JP-3 -> EUC-JP, ISO-2022-KR -> EUC-KR,
+ * ISO-2022-CN and HZ-GB-2312 -> EUC-CN, US-ASCII -> ISO-8859-1.
+ * Any other CES is returned unchanged.
+ */
+wc_ces
+wc_guess_8bit_charset(wc_ces orig)
+{
+    if (orig == WC_CES_ISO_2022_JP ||
+        orig == WC_CES_ISO_2022_JP_2 ||
+        orig == WC_CES_ISO_2022_JP_3)
+        return WC_CES_EUC_JP;
+
+    if (orig == WC_CES_ISO_2022_KR)
+        return WC_CES_EUC_KR;
+
+    if (orig == WC_CES_ISO_2022_CN || orig == WC_CES_HZ_GB_2312)
+        return WC_CES_EUC_CN;
+
+    if (orig == WC_CES_US_ASCII)
+        return WC_CES_ISO_8859_1;
+
+    return orig;
+}
+
+/*
+ * Tell whether `ces` is a known CES: its index must not exceed WC_CES_END
+ * and the WcCesInfo entry at that index must carry the same id.
+ */
+wc_bool
+wc_check_ces(wc_ces ces)
+{
+    size_t idx = WC_CES_INDEX(ces);
+
+    if (idx > WC_CES_END)
+        return 0;
+    return (WcCesInfo[idx].id == ces);
+}
+
+/*
+ * qsort() comparator: orders wc_ces_list entries by description,
+ * ignoring case.
+ */
+static int
+wc_ces_list_cmp(const void *a, const void *b)
+{
+    const wc_ces_list *lhs = a;
+    const wc_ces_list *rhs = b;
+
+    return strcasecmp(lhs->desc, rhs->desc);
+}
+
+/* Result of wc_get_ces_list(), built on the first call and then reused. */
+static wc_ces_list *list = NULL;
+
+/*
+ * Return the list of named character encoding schemes, sorted by
+ * description (case-insensitive).
+ *
+ * The list is built once from the WcCesInfo entries that have a non-NULL
+ * name, scanning until an entry with id 0, and is terminated by an element
+ * whose id is 0 and whose name/desc are NULL.  Later calls return the same
+ * array.
+ */
+wc_ces_list *
+wc_get_ces_list(void)
+{
+    wc_ces_info *src;
+    wc_ces_list *dst;
+    size_t count = 0;
+
+    if (list != NULL)
+        return list;
+
+    /* First pass: count the entries that will be listed. */
+    for (src = WcCesInfo; src->id; src++) {
+        if (src->name != NULL)
+            count++;
+    }
+
+    /* One extra slot for the terminating element. */
+    list = New_N(wc_ces_list, count + 1);
+
+    /* Second pass: copy the named entries. */
+    dst = list;
+    for (src = WcCesInfo; src->id; src++) {
+        if (src->name == NULL)
+            continue;
+        dst->id = src->id;
+        dst->name = src->name;
+        dst->desc = src->desc;
+        dst++;
+    }
+    dst->id = 0;
+    dst->name = NULL;
+    dst->desc = NULL;
+
+    /* The terminator is left out of the sort. */
+    qsort(list, count, sizeof(wc_ces_list), wc_ces_list_cmp);
+    return list;
+}