diff options
author | Akinori Ito <aito@eie.yz.yamagata-u.ac.jp> | 2001-11-08 05:14:08 +0000 |
---|---|---|
committer | Akinori Ito <aito@eie.yz.yamagata-u.ac.jp> | 2001-11-08 05:14:08 +0000 |
commit | 68a07bf03b7624c9924065cce9ffa45497225834 (patch) | |
tree | c2adb06a909a8594445e4a3f8587c4bad46e3ecd /conv.c | |
download | w3m-68a07bf03b7624c9924065cce9ffa45497225834.tar.gz w3m-68a07bf03b7624c9924065cce9ffa45497225834.zip |
Initial revision
Diffstat (limited to '')
-rw-r--r-- | conv.c | 699 |
1 files changed, 699 insertions, 0 deletions
@@ -0,0 +1,699 @@ +#include <stdio.h> +#include <string.h> +#include "fm.h" + +#ifdef JP_CHARSET +#include "terms.h" +#include "Str.h" + +#ifdef DEBUG +#include <malloc.h> +#endif /* DEBUG */ + +#define uchar unsigned char +#define ushort unsigned short +#define uint unsigned int + +#ifdef TRUE +#undef TRUE +#endif +#ifdef FALSE +#undef FALSE +#endif +#define TRUE 1 +#define FALSE 0 +#ifdef ESC_CODE +#undef ESC_CODE +#endif +#define ESC_CODE '\033' + +#define CODE_NORMAL 0x00 +#define CODE_OK 0x01 +#define CODE_BROKEN 0x02 +#define CODE_ERROR 0x04 +#define EUC_NOSTATE 0x00 +#define EUC_MBYTE1 0x10 +#define EUC_SS2 0x20 +#define EUC_SS3 0x40 +#define SJIS_NOSTATE 0x00 +#define SJIS_SHIFT_L 0x10 +#define SJIS_SHIFT_H 0x20 +#define ISO_NOSTATE 0x00 +#define ISO_ESC 0x10 +#define ISO_CS94 0x20 +#define ISO_MBCS 0x40 +#define ISO_MBYTE1 0x80 +#define CODE_STATE(c) ((c) & 0x0f) +#define EUC_STATE(c) ((c) & 0xf0) +#define SJIS_STATE(c) ((c) & 0xf0) +#define ISO_STATE(c) ((c) & 0xf0) + +#define CSET_ASCII 0 +#define CSET_X0208 1 +#define CSET_X0201K 2 +#define CSET_UNKNOWN 3 + +#define JSIcode "\033$@" +#define JSOcode "\033(H" +#define J2SIcode "\033$@" +#define J2SOcode "\033(J" +#define NSIcode "\033$B" +#define NSOcode "\033(J" +#define N2SIcode "\033$B" +#define N2SOcode "\033(B" +#define N3SIcode "\033$@" +#define N3SOcode "\033(B" +#define USIcode "\033$" +#define USOcode "\033+" + +static char *SIcode, *SOcode; + +static Str cConvEE(Str is); +static Str cConvEJ(Str is); +static Str cConvES(Str is); +static Str cConvSE(Str is); +static Str cConvJE(Str is); +char checkShiftCode(Str buf, uchar); + +static char *han2zen_tab[] = +{ + "!!", "!#", "!V", "!W", "!\"", "!&", "%r", "%!", + "%#", "%%", "%'", "%)", "%c", "%e", "%g", "%C", + "!<", "%\"", "%$", "%&", "%(", "%*", "%+", "%-", + "%/", "%1", "%3", "%5", "%7", "%9", "%;", "%=", + "%?", "%A", "%D", "%F", "%H", "%J", "%K", "%L", + "%M", "%N", "%O", "%R", "%U", "%X", "%[", "%^", + "%_", "%`", "%a", "%b", "%d", "%f", "%h", "%i", + "%j", "%k", "%l", "%m", "%o", "%s", "!+", "!,", +}; + +typedef struct _ConvRoutine { + char key; + Str(*routine) (); + char *ShiftIn, *ShiftOut; +} ConvRoutine; + +static ConvRoutine FromEJ[] = +{ + {CODE_JIS_J, cConvEJ, JSIcode, JSOcode}, + {CODE_JIS_N, cConvEJ, NSIcode, NSOcode}, + {CODE_JIS_n, cConvEJ, N2SIcode, N2SOcode}, + {CODE_JIS_m, cConvEJ, N3SIcode, N3SOcode}, + {CODE_JIS_j, cConvEJ, J2SIcode, J2SOcode}, + {CODE_SJIS, cConvES, "", ""}, + {CODE_EUC, cConvEE, "", ""}, + {'\0', NULL, NULL, NULL} +}; + +static ConvRoutine ToEJ[] = +{ + {CODE_JIS_J, cConvJE, JSIcode, JSOcode}, + {CODE_JIS_N, cConvJE, NSIcode, NSOcode}, + {CODE_JIS_n, cConvJE, N2SIcode, N2SOcode}, + {CODE_JIS_m, cConvJE, N3SIcode, N3SOcode}, + {CODE_JIS_j, cConvJE, J2SIcode, J2SOcode}, + {CODE_SJIS, cConvSE, "", ""}, + {CODE_EUC, cConvEE, "", ""}, + {'\0', NULL, NULL, NULL} +}; + +char * +GetSICode(char key) +{ + int i; + for (i = 0; FromEJ[i].key != '\0' ; i++) + if (FromEJ[i].key == key) + return FromEJ[i].ShiftIn; + return ""; +} + +char * +GetSOCode(char key) +{ + int i; + for (i = 0; FromEJ[i].key != '\0'; i++) + if (FromEJ[i].key == key) + return FromEJ[i].ShiftOut; + return ""; +} + +static void +n_impr(char s) +{ + fprintf(stderr, "conv: option %c(0x%02x) is not implemented yet... sorry\n", s, s); + exit(1); +} + +Str +conv_str(Str is, char fc, char tc) +{ + int i; + Str os; + static char from_code = '\0'; + static char to_code = '\0'; + static Str (*conv_from) (); + static Str (*conv_to) (); + + if (fc == tc || fc == CODE_ASCII || tc == CODE_ASCII) + return is; + + if (fc == CODE_INNER_EUC) + os = is; + else { + if (from_code != fc) { + for (i = 0; ToEJ[i].key != '\0'; i++) { + if (ToEJ[i].key == fc) { + from_code = fc; + conv_from = *ToEJ[i].routine; + goto next; + } + } + n_impr(fc); + return NULL; + } + next: + os = conv_from(is); + } + if (tc == CODE_INNER_EUC || tc == CODE_EUC) + return os; + else { + if (to_code != tc) { + for (i = 0; FromEJ[i].key != '\0'; i++) { + if (FromEJ[i].key == tc) { + SIcode = FromEJ[i].ShiftIn; + SOcode = FromEJ[i].ShiftOut; + to_code = tc; + conv_to = *FromEJ[i].routine; + goto next2; + } + } + n_impr(tc); + return NULL; + } + next2: + return conv_to(os); + } +} + +Str +conv(char *is, char fc, char tc) +{ + return conv_str(Strnew_charp(is), fc, tc); +} + +static uchar +getSLb(uchar * ptr, uchar * ub) +{ /* Get Shift-JIS Lower byte */ + uchar c = *ptr; + + *ub <<= 1; + if (c < 0x9f) { + if (c > 0x7e) + c--; + *ub -= 1; + c -= 0x3f; + } + else { + c -= 0x9e; + } + return c; +} + +static Str +cConvSE(Str is) +{ /* Convert Shift-JIS to EUC-JP */ + uchar *p, ub, lb; + int state = SJIS_NOSTATE; + Str os = Strnew_size(is->length); + uchar *endp = (uchar *) &is->ptr[is->length]; + + for (p = (uchar *) is->ptr; p < endp; p++) { + switch (state) { + case SJIS_NOSTATE: + if (!(*p & 0x80)) /* ASCII */ + Strcat_char(os, (char) (*p)); + else if (0x81 <= *p && *p <= 0x9f) { /* JIS X 0208, + * 0213 */ + ub = *p & 0x7f; + state = SJIS_SHIFT_L; + } + else if (0xe0 <= *p && *p <= 0xef) { /* JIS X 0208 */ + /* } else if (0xe0 <= *p && *p <= 0xfc) { *//* JIS X 0213 */ + ub = (*p & 0x7f) - 0x40; + state = SJIS_SHIFT_H; + } + else if (0xa0 <= *p && *p <= 0xdf) { /* JIS X 0201-Kana + */ + Strcat_char(os, (char) (han2zen_tab[*p - 0xa0][0] | 0x80)); + Strcat_char(os, (char) (han2zen_tab[*p - 0xa0][1] | 0x80)); + } + break; + case SJIS_SHIFT_L: + case SJIS_SHIFT_H: + if ((0x40 <= *p && *p <= 0x7e) || + (0x80 <= *p && *p <= 0xfc)) { /* JIS X 0208, 0213 */ + lb = getSLb(p, &ub); + ub += 0x20; + lb += 0x20; + Strcat_char(os, (char) (ub | 0x80)); + Strcat_char(os, (char) (lb | 0x80)); + } + else if (!(*p & 0x80)) /* broken ? */ + Strcat_char(os, (char) (*p)); + state = SJIS_NOSTATE; + break; + } + } + return os; +} + +static Str +cConvJE(Str is) +{ /* Convert ISO-2022-JP to EUC-JP */ + uchar *p, ub; + char cset = CSET_ASCII; + int state = ISO_NOSTATE; + Str os = Strnew_size(is->length); + uchar *endp = (uchar *) &is->ptr[is->length]; + + for (p = (uchar *) is->ptr; p < endp; p++) { + switch (state) { + case ISO_NOSTATE: + if (*p == ESC_CODE) /* ESC sequence */ + state = ISO_ESC; + else if (cset == CSET_ASCII || *p < 0x21) + Strcat_char(os, (char) (*p)); + else if (cset == CSET_X0208 && *p <= 0x7e) { + /* JIS X 0208 */ + ub = *p; + state = ISO_MBYTE1; + } + else if (cset == CSET_X0201K && *p <= 0x5f) { + /* JIS X 0201-Kana */ + Strcat_char(os, (char) (han2zen_tab[*p - 0x20][0] | 0x80)); + Strcat_char(os, (char) (han2zen_tab[*p - 0x20][1] | 0x80)); + } + break; + case ISO_MBYTE1: + if (*p == ESC_CODE) /* ESC sequence */ + state = ISO_ESC; + else if (0x21 <= *p && *p <= 0x7e) { /* JIS X 0208 */ + Strcat_char(os, (char) (ub | 0x80)); + Strcat_char(os, (char) (*p | 0x80)); + state = ISO_NOSTATE; + } + else { + Strcat_char(os, (char) (*p)); + state = ISO_NOSTATE; + } + break; + case ISO_ESC: + if (*p == '(') /* ESC ( F */ + state = ISO_CS94; + else if (*p == '$') /* ESC $ F, ESC $ ( F */ + state = ISO_MBCS; + else { + Strcat_char(os, ESC_CODE); + Strcat_char(os, (char) (*p)); + state = ISO_NOSTATE; + } + break; + case ISO_CS94: + if (*p == 'B' || *p == 'J' || *p == 'H') + cset = CSET_ASCII; + else if (*p == 'I') + cset = CSET_X0201K; + else { + Strcat_char(os, ESC_CODE); + Strcat_char(os, '('); + Strcat_char(os, (char) (*p)); + } + state = ISO_NOSTATE; + break; + case ISO_MBCS: + if (*p == '(') { /* ESC $ ( F */ + state = ISO_MBCS | ISO_CS94; + break; + } + case ISO_MBCS | ISO_CS94: + if (*p == 'B' || *p == '@') + cset = CSET_X0208; + else { + Strcat_char(os, ESC_CODE); + Strcat_char(os, '$'); + if (state == (ISO_MBCS | ISO_CS94)) + Strcat_char(os, '('); + Strcat_char(os, (char) (*p)); + } + state = ISO_NOSTATE; + break; + } + } + return os; +} + +static Str +_cConvEE(Str is, char is_euc) +{ /* Convert EUC-JP to EUC-JP / ISO-2022-JP + * (no JIS X 0201-Kana, 0212, 0213-2) */ + uchar *p, ub, euc = 0; + int state = EUC_NOSTATE; + char cset = CSET_ASCII; + Str os; + uchar *endp = (uchar *) &is->ptr[is->length]; + + if (is_euc) { + os = Strnew_size(is->length); + euc = 0x80; + } + else + os = Strnew_size(is->length * 3 / 2); + + for (p = (uchar *) is->ptr; p < endp; p++) { + switch (state) { + case EUC_NOSTATE: + if (!(*p & 0x80)) { /* ASCII */ + if (!is_euc && cset != CSET_ASCII) { + Strcat_charp(os, SOcode); + cset = CSET_ASCII; + } + Strcat_char(os, (char) (*p)); + } + else if (0xa1 <= *p && *p <= 0xfe) { /* JIS X 0208, + * 0213-1 */ + ub = *p; + state = EUC_MBYTE1; + } + else if (*p == EUC_SS2_CODE) /* SS2 + JIS X 0201-Kana */ + state = EUC_SS2; + else if (*p == EUC_SS3_CODE) /* SS3 + JIS X 0212, 0213-2 */ + state = EUC_SS3; + break; + case EUC_MBYTE1: + if (0xa1 <= *p && *p <= 0xfe) { /* JIS X 0208, 0213-1 */ + if (!is_euc && cset != CSET_X0208) { + Strcat_charp(os, SIcode); + cset = CSET_X0208; + } + Strcat_char(os, (char) ((ub & 0x7f) | euc)); + Strcat_char(os, (char) ((*p & 0x7f) | euc)); + } + else if (!(*p & 0x80)) { /* broken ? */ + if (!is_euc && cset != CSET_ASCII) { + Strcat_charp(os, SOcode); + cset = CSET_ASCII; + } + Strcat_char(os, (char) (*p)); + } + state = EUC_NOSTATE; + break; + case EUC_SS2: + if (0xa0 <= *p && *p <= 0xdf) { /* JIS X 0201-Kana */ + if (!is_euc && cset != CSET_X0208) { + Strcat_charp(os, SIcode); + cset = CSET_X0208; + } + Strcat_char(os, (char) (han2zen_tab[*p - 0xa0][0] | euc)); + Strcat_char(os, (char) (han2zen_tab[*p - 0xa0][1] | euc)); + } + state = EUC_NOSTATE; + break; + case EUC_SS3: + state = (EUC_SS3 | EUC_MBYTE1); + break; + case EUC_SS3 | EUC_MBYTE1: + state = EUC_NOSTATE; + break; + } + } + if (!is_euc && cset != CSET_ASCII) + Strcat_charp(os, SOcode); + return os; +} + +static Str +cConvEE(Str is) +{ + return _cConvEE(is, TRUE); +} + +static Str +cConvEJ(Str is) +{ + return _cConvEE(is, FALSE); +} + +void +put_sjis(Str os, uchar ub, uchar lb) +{ + ub -= 0x20; + lb -= 0x20; + if ((ub & 1) == 0) + lb += 94; + ub = ((ub - 1) >> 1) + 0x81; + lb += 0x3f; + if (ub > 0x9f) + ub += 0x40; + if (lb > 0x7e) + lb++; + + Strcat_char(os, (char) (ub)); + Strcat_char(os, (char) (lb)); +} + +static Str +cConvES(Str is) +{ /* Convert EUC-JP to Shift-JIS */ + uchar *p, ub; + int state = EUC_NOSTATE; + Str os = Strnew_size(is->length); + uchar *endp = (uchar *) &is->ptr[is->length]; + + for (p = (uchar *) is->ptr; p < endp; p++) { + switch (state) { + case EUC_NOSTATE: + if (!(*p & 0x80)) /* ASCII */ + Strcat_char(os, (char) (*p)); + else if (0xa1 <= *p && *p <= 0xfe) { /* JIS X 0208, + * 0213-1 */ + ub = *p; + state = EUC_MBYTE1; + } + else if (*p == EUC_SS2_CODE) /* SS2 + JIS X 0201-Kana */ + state = EUC_SS2; + else if (*p == EUC_SS3_CODE) /* SS3 + JIS X 0212, 0213-2 */ + state = EUC_SS3; + break; + case EUC_MBYTE1: + if (0xa1 <= *p && *p <= 0xfe) /* JIS X 0208, 0213-1 */ + put_sjis(os, ub & 0x7f, *p & 0x7f); + else if (!(*p & 0x80)) /* broken ? */ + Strcat_char(os, (char) (*p)); + state = EUC_NOSTATE; + break; + case EUC_SS2: + if (0xa0 <= *p && *p <= 0xdf) /* JIS X 0201-Kana */ + put_sjis(os, han2zen_tab[*p - 0xa0][0], + han2zen_tab[*p - 0xa0][1]); + state = EUC_NOSTATE; + break; + case EUC_SS3: + state = (EUC_SS3 | EUC_MBYTE1); + break; + case EUC_SS3 | EUC_MBYTE1: + state = EUC_NOSTATE; + break; + } + } + return os; +} + +/* + * static ushort sjis_shift[8] = { 0x7fff, 0xffff, 0x0, 0x0, 0x0, + * 0x0, 0xffff, 0x0 }; static ushort sjis_second[16] = { 0x0, 0x0, + * 0x0, 0x0, 0xffff, 0xffff, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, + * 0xffff, 0xffff, 0xffff, 0xffff, 0xfff8 }; */ + +char +checkShiftCode(Str buf, uchar hint) +{ + uchar *p, si = '\0', so = '\0'; + int euc = (CODE_NORMAL | EUC_NOSTATE), + sjis = (CODE_NORMAL | SJIS_NOSTATE), sjis_kana = CODE_NORMAL, + iso = (CODE_NORMAL | ISO_NOSTATE), iso_kana = CODE_NORMAL; + uchar *endp = (uchar *) &buf->ptr[buf->length]; + + if (hint == CODE_INNER_EUC) + return '\0'; + p = (uchar *) buf->ptr; + while (1) { + if (iso != CODE_ERROR && (si == '\0' || so == '\0')) { + switch (ISO_STATE(iso)) { + case ISO_NOSTATE: + if (*p == ESC_CODE) /* ESC sequence */ + iso = (CODE_STATE(iso) | ISO_ESC); + break; + case ISO_ESC: + if (*p == '(') /* ESC ( F */ + iso = (CODE_STATE(iso) | ISO_CS94); + else if (*p == '$') /* ESC $ F, ESC $ ( F */ + iso = (CODE_STATE(iso) | ISO_MBCS); + else + iso = (CODE_STATE(iso) | ISO_NOSTATE); + break; + case ISO_CS94: + if (*p == 'B' || *p == 'J' || *p == 'H') + so = *p; + else if (*p == 'I') + iso_kana = CODE_OK; + iso = (CODE_STATE(iso) | ISO_NOSTATE); + break; + case ISO_MBCS: + if (*p == '(') { /* ESC $ ( F */ + iso = (CODE_STATE(iso) | ISO_MBCS | ISO_CS94); + break; + } + case ISO_MBCS | ISO_CS94: + if (*p == 'B' || *p == '@') + si = *p; + iso = (CODE_STATE(iso) | ISO_NOSTATE); + break; + } + if (*p & 0x80) + iso = CODE_ERROR; + } + if (euc != CODE_ERROR) { + switch (EUC_STATE(euc)) { + case EUC_NOSTATE: + if (!(*p & 0x80)) /* ASCII */ + ; + else if (0xa1 <= *p && *p <= 0xfe) /* JIS X 0208, + * 0213-1 */ + euc = (CODE_STATE(euc) | EUC_MBYTE1); + else if (*p == EUC_SS2_CODE) /* SS2 + JIS X 0201-Kana */ + euc = (CODE_STATE(euc) | EUC_SS2); + else if (*p == EUC_SS3_CODE) /* SS3 + JIS X 0212, 0213-2 */ + euc = (CODE_STATE(euc) | EUC_SS3); + else + euc = CODE_ERROR; + break; + case EUC_MBYTE1: + if (CODE_STATE(euc) == CODE_NORMAL) + euc = CODE_OK; + case EUC_SS3 | EUC_MBYTE1: + if (0xa1 <= *p && *p <= 0xfe) /* JIS X 0208, 0213-1 */ + euc = (CODE_STATE(euc) | EUC_NOSTATE); + else if (euc & CODE_BROKEN) + euc = CODE_ERROR; + else + euc = (CODE_BROKEN | EUC_NOSTATE); + break; + case EUC_SS2: + if (0xa0 <= *p && *p <= 0xdf) /* JIS X 0201-Kana */ + euc = (CODE_STATE(euc) | EUC_NOSTATE); + else + euc = CODE_ERROR; + break; + case EUC_SS3: + if (0xa1 <= *p && *p <= 0xfe) /* JIS X 0212, 0213-2 */ + euc = (CODE_STATE(euc) | EUC_SS3 | EUC_MBYTE1); + else + euc = CODE_ERROR; + break; + } + } + if (sjis != CODE_ERROR) { + switch (SJIS_STATE(sjis)) { + case SJIS_NOSTATE: + if (!(*p & 0x80)) /* ASCII */ + ; + else if (0x81 <= *p && *p <= 0x9f) + sjis = (CODE_STATE(sjis) | SJIS_SHIFT_L); + else if (0xe0 <= *p && *p <= 0xef) /* JIS X 0208 */ + /* else if (0xe0 <= *p && *p <= 0xfc) */ + /* JIS X 0213 */ + sjis = (CODE_STATE(sjis) | SJIS_SHIFT_H); + else if (0xa0 == *p) + sjis = (CODE_BROKEN | SJIS_NOSTATE); + else if (0xa1 <= *p && *p <= 0xdf) /* JIS X 0201-Kana + */ + sjis_kana = CODE_OK; + else + sjis = CODE_ERROR; + break; + case SJIS_SHIFT_L: + case SJIS_SHIFT_H: + if (CODE_STATE(sjis) == CODE_NORMAL) + sjis = CODE_OK; + if ((0x40 <= *p && *p <= 0x7e) || + (0x80 <= *p && *p <= 0xfc)) /* JIS X 0208, + * 0213 */ + sjis = (CODE_STATE(sjis) | SJIS_NOSTATE); + else if (sjis & CODE_BROKEN) + sjis = CODE_ERROR; + else + sjis = (CODE_BROKEN | SJIS_NOSTATE); + break; + } + } + if (euc == CODE_ERROR || sjis == CODE_ERROR) + break; + if (p == endp) + break; + p++; + } + if (iso != CODE_ERROR) { + if (si == '\0' && so == '\0' && iso_kana != CODE_OK) + return '\0'; + switch (si) { + case '@': + switch (so) { + case 'H': + return CODE_JIS_J; + case 'J': + return CODE_JIS_j; + case 'B': + return CODE_JIS_m; + default: + return CODE_JIS_m; + } + case 'B': + switch (so) { + case 'J': + return CODE_JIS_N; + case 'B': + return CODE_JIS_n; + default: + return CODE_JIS_n; + } + default: + switch (so) { + case 'H': + return CODE_JIS_J; + case 'J': + return CODE_JIS_N; + case 'B': + return CODE_JIS_n; + default: + return CODE_JIS_n; + } + } + } + if (hint == CODE_EUC) { + if (euc != CODE_ERROR) + return CODE_EUC; + } else if (hint == CODE_SJIS) { + if (sjis != CODE_ERROR) + return CODE_SJIS; + } + if (CODE_STATE(euc) == CODE_OK) + return CODE_EUC; + if (CODE_STATE(sjis) == CODE_OK) + return CODE_SJIS; + if (CODE_STATE(euc) == CODE_NORMAL) + return CODE_EUC; + if (CODE_STATE(sjis) == CODE_NORMAL) + return CODE_SJIS; + return CODE_EUC; +} +#endif /* JP_CHARSET */ |