diff options
Diffstat (limited to '')
-rw-r--r-- | libwc/charset.c | 492 |
1 files changed, 492 insertions, 0 deletions
diff --git a/libwc/charset.c b/libwc/charset.c new file mode 100644 index 0000000..95343b3 --- /dev/null +++ b/libwc/charset.c @@ -0,0 +1,492 @@ + +#include <stdlib.h> +#include <ctype.h> +#include <gc.h> +#define New_N(type,n) ((type*)GC_MALLOC((n)*sizeof(type))) + +#include "wc.h" + +wc_locale WcLocale = 0; + +static struct { + char *lang; + wc_ces ces; +} lang_ces_table[] = { + { "cs", WC_CES_ISO_8859_2 }, /* cs_CZ */ + { "el", WC_CES_ISO_8859_7 }, /* el_GR */ + { "iw", WC_CES_ISO_8859_8 }, /* iw_IL */ + { "ja", WC_CES_EUC_JP }, /* ja_JP */ + { "ko", WC_CES_EUC_KR }, /* ko_KR */ + { "hu", WC_CES_ISO_8859_2 }, /* hu_HU */ + { "pl", WC_CES_ISO_8859_2 }, /* pl_PL */ + { "ro", WC_CES_ISO_8859_2 }, /* ro_RO */ + { "ru", WC_CES_ISO_8859_5 }, /* ru_SU */ + { "sk", WC_CES_ISO_8859_2 }, /* sk_SK */ + { "sl", WC_CES_ISO_8859_2 }, /* sl_CS */ + { "tr", WC_CES_ISO_8859_9 }, /* tr_TR */ + { "zh", WC_CES_EUC_CN }, /* zh_CN */ + { NULL, 0 } +}; + +wc_ces +wc_guess_charset(char *charset, wc_ces orig) +{ + wc_ces guess; + + if (charset == NULL || *charset == '\0') + return orig; + guess = wc_charset_to_ces(charset); + return guess ? guess : orig; +} + +wc_ces +wc_guess_charset_short(char *charset, wc_ces orig) +{ + wc_ces guess; + + if (charset == NULL || *charset == '\0') + return orig; + guess = wc_charset_short_to_ces(charset); + return guess ? guess : orig; +} + +wc_ces +wc_guess_locale_charset(char *locale, wc_ces orig) +{ + wc_ces guess; + + if (locale == NULL || *locale == '\0') + return orig; + guess = wc_locale_to_ces(locale); + return guess ? guess : orig; +} + +wc_ces +wc_charset_to_ces(char *charset) +{ + char *p = charset; + char buf[16]; + int n; + + if (tolower(*p) == 'x' && *(p+1) == '-') + p += 2; + for (n = 0; *p && n < 15; p++) { + if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-') + buf[n++] = tolower(*p); + } + buf[n] = 0; + p = buf; + switch (*p) { + case 'e': + if (! strncmp(p, "euc", 3)) { + p += 3; + switch (*p) { + case 'j': return WC_CES_EUC_JP; + case 'c': return WC_CES_EUC_CN; + case 't': return WC_CES_EUC_TW; + case 'k': return WC_CES_EUC_KR; + } + switch (WcLocale) { + case WC_LOCALE_JA_JP: return WC_CES_EUC_JP; + case WC_LOCALE_ZH_CN: return WC_CES_EUC_CN; + case WC_LOCALE_ZH_TW: return WC_CES_EUC_TW; + case WC_LOCALE_ZH_HK: return WC_CES_EUC_CN; + case WC_LOCALE_KO_KR: return WC_CES_EUC_KR; + } + return WC_CES_EUC_JP; + } + break; + case 'i': + if (! strncmp(p, "iso2022", 7)) { + p += 7; + switch (*p) { + case 'j': + if (! strncmp(p, "jp2", 3)) + return WC_CES_ISO_2022_JP_2; + if (! strncmp(p, "jp3", 3)) + return WC_CES_ISO_2022_JP_3; + return WC_CES_ISO_2022_JP; + case 'c': return WC_CES_ISO_2022_CN; + case 'k': return WC_CES_ISO_2022_KR; + } + return WC_CES_ISO_2022_JP; + } else if (! strncmp(p, "iso8859", 7)) { + n = atoi(p + 7); + if (n >= 1 && n <= 16 && n != 12) + return (WC_CES_E_ISO_8859 | n); + return WC_CES_ISO_8859_1; + } + break; + case 'j': + if (! strncmp(p, "johab", 5)) + return WC_CES_JOHAB; + if (! strncmp(p, "jis", 3)) + return WC_CES_ISO_2022_JP; + break; + case 's': + if (! strncmp(p, "shiftjisx0213", 13) || + ! strncmp(p, "sjisx0213", 9)) + return WC_CES_SHIFT_JISX0213; + if (! strncmp(p, "shiftjis", 8) || + ! strncmp(p, "sjis", 4)) + return WC_CES_SHIFT_JIS; + break; + case 'g': + if (! strncmp(p, "gb18030", 7) || + ! strncmp(p, "gbk2k", 5)) + return WC_CES_GB18030; + if (! strncmp(p, "gbk", 3)) + return WC_CES_GBK; + if (! strncmp(p, "gb2312", 6)) + return WC_CES_EUC_CN; + break; + case 'b': + if (! strncmp(p, "big5hkscs", 9)) + return WC_CES_HKSCS; + if (! strncmp(p, "big5", 4)) + return WC_CES_BIG5; + break; + case 'h': + if (! strncmp(p, "hz", 2)) + return WC_CES_HZ_GB_2312; + if (! strncmp(p, "hkscs", 5)) + return WC_CES_HKSCS; + break; + case 'k': + if (! strncmp(p, "koi8r", 5)) + return WC_CES_KOI8_R; + if (! strncmp(p, "koi8u", 5)) + return WC_CES_KOI8_U; + if (! strncmp(p, "ksx1001", 7)) + return WC_CES_EUC_KR; + if (! strncmp(p, "ksc5601", 7)) + return WC_CES_EUC_KR; + break; + case 't': + if (! strncmp(p, "tis620", 6)) + return WC_CES_TIS_620; + if (! strncmp(p, "tcvn", 4)) + return WC_CES_TCVN_5712; + break; + case 'n': + if (! strncmp(p, "next", 4)) + return WC_CES_NEXTSTEP; + break; + case 'v': + if (! strncmp(p, "viet", 4)) { + p += 4; + if (! strncmp(p, "tcvn", 4)) + return WC_CES_TCVN_5712; + } + if (! strncmp(p, "viscii", 6)) + return WC_CES_VISCII_11; + if (! strncmp(p, "vps", 3)) + return WC_CES_VPS; + break; + case 'u': +#ifdef USE_UNICODE + if (! strncmp(p, "utf8", 4)) + return WC_CES_UTF_8; + if (! strncmp(p, "utf7", 4)) + return WC_CES_UTF_7; +#endif + if (! strncmp(p, "uhc", 3)) + return WC_CES_UHC; + if (! strncmp(p, "ujis", 4)) + return WC_CES_EUC_JP; + if (! strncmp(p, "usascii", 7)) + return WC_CES_US_ASCII; + break; + case 'a': + if (! strncmp(p, "ascii", 5)) + return WC_CES_US_ASCII; + break; + case 'c': + if (! strncmp(p, "cngb", 4)) + return WC_CES_EUC_CN; + if (*(p+1) != 'p') + break; + n = atoi(p + 2); + switch (n) { + case 437: return WC_CES_CP437; + case 737: return WC_CES_CP737; + case 775: return WC_CES_CP775; + case 850: return WC_CES_CP850; + case 852: return WC_CES_CP852; + case 855: return WC_CES_CP855; + case 856: return WC_CES_CP856; + case 857: return WC_CES_CP857; + case 860: return WC_CES_CP860; + case 861: return WC_CES_CP861; + case 862: return WC_CES_CP862; + case 863: return WC_CES_CP863; + case 864: return WC_CES_CP864; + case 865: return WC_CES_CP865; + case 866: return WC_CES_CP866; + case 869: return WC_CES_CP869; + case 874: return WC_CES_CP874; + case 932: return WC_CES_CP932; /* CP932 = Shift_JIS */ + case 936: return WC_CES_CP936; /* CP936 = GBK > EUC_CN */ + case 949: return WC_CES_CP949; /* CP949 = UHC > EUC_KR */ + case 950: return WC_CES_CP950; /* CP950 = Big5 */ + case 1006: return WC_CES_CP1006; + case 1250: return WC_CES_CP1250; + case 1251: return WC_CES_CP1251; + case 1252: return WC_CES_CP1252; + case 1253: return WC_CES_CP1253; + case 1254: return WC_CES_CP1254; + case 1255: return WC_CES_CP1255; + case 1256: return WC_CES_CP1256; + case 1257: return WC_CES_CP1257; + case 1258: return WC_CES_CP1258; + } + break; + case 'w': + if (strncmp(p, "windows", 7)) + break; + if (! strncmp(p, "31j", 3)) + return WC_CES_CP932; + n = atoi(p + 7); + switch (n) { + case 1250: return WC_CES_CP1250; + case 1251: return WC_CES_CP1251; + case 1252: return WC_CES_CP1252; + case 1253: return WC_CES_CP1253; + case 1254: return WC_CES_CP1254; + case 1255: return WC_CES_CP1255; + case 1256: return WC_CES_CP1256; + case 1257: return WC_CES_CP1257; + case 1258: return WC_CES_CP1258; + } + break; + } + return 0; +} + +wc_ces +wc_charset_short_to_ces(char *charset) +{ + char *p = charset; + char buf[16]; + wc_ces ces; + int n; + + ces = wc_charset_to_ces(charset); + if (ces) + return ces; + + for (n = 0; *p && n < 15; p++) { + if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-') + buf[n++] = tolower(*p); + } + buf[n] = 0; + p = buf; + switch (*p) { + case 'e': + switch (*(p+1)) { + case 'j': return WC_CES_EUC_JP; + case 'c': return WC_CES_EUC_CN; + case 't': return WC_CES_EUC_TW; + case 'k': return WC_CES_EUC_KR; + } + return WC_CES_EUC_JP; + case 'j': + p++; + if (*p == 'o') + return WC_CES_JOHAB; + if (*p == 'p') + p++; + if (*p == '2') + return WC_CES_ISO_2022_JP_2; + if (*p == '3') + return WC_CES_ISO_2022_JP_3; + return WC_CES_ISO_2022_JP; + case 's': + return WC_CES_SHIFT_JIS; + case 'g': + return WC_CES_EUC_CN; + case 'b': + return WC_CES_BIG5; + case 'h': + if (*(p+1) == 'k') + return WC_CES_HKSCS; + return WC_CES_HZ_GB_2312; + case 'k': + if (*(p+1) == 'o') + return WC_CES_KOI8_R; + return WC_CES_ISO_2022_KR; + case 'l': + n = atoi(p + 1); + if (n >= 1 && n <= 16 && n != 12) + return (WC_CES_E_ISO_8859 | n); + return WC_CES_ISO_8859_1; + case 't': + if (*(p+1) == 'c') + return WC_CES_TCVN_5712; + return WC_CES_TIS_620; + case 'n': + return WC_CES_NEXTSTEP; + case 'v': + if (*(p+1) == 'p') + return WC_CES_VPS; + return WC_CES_VISCII_11; +#ifdef USE_UNICODE + case 'u': + if (*(p+1) == '7') + return WC_CES_UTF_7; + return WC_CES_UTF_8; +#endif + case 'a': + return WC_CES_US_ASCII; + case 'c': + return WC_CES_ISO_2022_CN; + case 'w': + n = atoi(p + 1); + switch (n) { + case 1250: return WC_CES_CP1250; + case 1251: return WC_CES_CP1251; + case 1252: return WC_CES_CP1252; + case 1253: return WC_CES_CP1253; + case 1254: return WC_CES_CP1254; + case 1255: return WC_CES_CP1255; + case 1256: return WC_CES_CP1256; + case 1257: return WC_CES_CP1257; + case 1258: return WC_CES_CP1258; + } + break; + case 'r': + return WC_CES_RAW; + } + return 0; +} + +wc_ces +wc_locale_to_ces(char *locale) +{ + char *p = locale; + char buf[6]; + int n; + + if (*p == 'C' && *(p+1) == '\0') + return WC_CES_US_ASCII; + for (n = 0; *p && *p != '.' && n < 5; p++) { + if ((unsigned char)*p > 0x20) + buf[n++] = tolower(*p); + } + buf[n] = 0; + if (*p == '.') { + p++; + if (! strcasecmp(p, "euc")) { + switch (buf[0]) { + case 'j': + WcLocale = WC_LOCALE_JA_JP; + break; + case 'k': + WcLocale = WC_LOCALE_KO_KR; + break; + case 'z': + if (!strcmp(buf, "zh_tw")) + WcLocale = WC_LOCALE_ZH_TW; + else if (!strcmp(buf, "zh_hk")) + WcLocale = WC_LOCALE_ZH_HK; + else + WcLocale = WC_LOCALE_ZH_CN; + break; + default: + WcLocale = 0; + break; + } + } + return wc_charset_to_ces(p); + } + + if (!strcmp(buf, "japanese")) + return WC_CES_SHIFT_JIS; + if (!strcmp(buf, "zh_tw") || + !strcmp(buf, "zh_hk")) + return WC_CES_BIG5; + for (n = 0; lang_ces_table[n].lang; n++) { + if (!strncmp(buf, lang_ces_table[n].lang, 2)) + return lang_ces_table[n].ces; + } + return WC_CES_ISO_8859_1; +} + +char * +wc_ces_to_charset(wc_ces ces) +{ + if (ces == WC_CES_WTF) + return "WTF"; + return WcCesInfo[WC_CES_INDEX(ces)].name; +} + +char * +wc_ces_to_charset_desc(wc_ces ces) +{ + if (ces == WC_CES_WTF) + return "W3M Transfer Format"; + return WcCesInfo[WC_CES_INDEX(ces)].desc; +} + +wc_ces +wc_guess_8bit_charset(wc_ces orig) +{ + switch (orig) { + case WC_CES_ISO_2022_JP: + case WC_CES_ISO_2022_JP_2: + case WC_CES_ISO_2022_JP_3: + return WC_CES_EUC_JP; + case WC_CES_ISO_2022_KR: + return WC_CES_EUC_KR; + case WC_CES_ISO_2022_CN: + case WC_CES_HZ_GB_2312: + return WC_CES_EUC_CN; + case WC_CES_US_ASCII: + return WC_CES_ISO_8859_1; + } + return orig; +} + +wc_bool +wc_check_ces(wc_ces ces) +{ + size_t i = WC_CES_INDEX(ces); + + return (i <= WC_CES_END && WcCesInfo[i].id == ces); +} + +static int +wc_ces_list_cmp(const void *a, const void *b) +{ + return strcasecmp(((wc_ces_list *)a)->desc, ((wc_ces_list *)b)->desc); +} + +static wc_ces_list *list = NULL; + +wc_ces_list * +wc_get_ces_list(void) +{ + wc_ces_info *info; + size_t n; + + if (list) + return list; + for (info = WcCesInfo, n = 0; info->id; info++) { + if (info->name != NULL) + n++; + } + list = New_N(wc_ces_list, n + 1); + for (info = WcCesInfo, n = 0; info->id; info++) { + if (info->name != NULL) { + list[n].id = info->id; + list[n].name = info->name; + list[n].desc = info->desc; + n++; + } + } + list[n].id = 0; + list[n].name = NULL; + list[n].desc = NULL; + qsort(list, n, sizeof(wc_ces_list), wc_ces_list_cmp); + return list; +} |