/* $Id: conv.c,v 1.7 2002/09/24 16:35:02 ukai Exp $ */
#include <stdio.h>
#include <string.h>
#include "fm.h"
#ifdef JP_CHARSET
#include "terms.h"
#include "Str.h"
#ifdef DEBUG
#include <malloc.h>
#endif /* DEBUG */
/* Short aliases for the unsigned integer types used throughout this file. */
#define uchar unsigned char
#define ushort unsigned short
#define uint unsigned int
/* Force local definitions of TRUE/FALSE regardless of what headers set. */
#ifdef TRUE
#undef TRUE
#endif
#ifdef FALSE
#undef FALSE
#endif
#define TRUE 1
#define FALSE 0
/* Force a local definition of the escape character (octal 033). */
#ifdef ESC_CODE
#undef ESC_CODE
#endif
#define ESC_CODE '\033'
/*
 * Detector verdict flags, kept in the low nibble of the per-encoding status
 * words used by checkShiftCode() and extracted with CODE_STATE().
 */
#define CODE_NORMAL 0x00
#define CODE_OK 0x01
#define CODE_BROKEN 0x02
#define CODE_ERROR 0x04
/*
 * Parser states for the EUC-JP state machines.  They live in the high nibble
 * so that checkShiftCode() can OR them together with a CODE_* flag.
 */
#define EUC_NOSTATE 0x00
#define EUC_MBYTE1 0x10
#define EUC_SS2 0x20
#define EUC_SS3 0x40
/* Parser states for the Shift-JIS state machines (high nibble). */
#define SJIS_NOSTATE 0x00
#define SJIS_SHIFT_L 0x10
#define SJIS_SHIFT_H 0x20
/* Parser states for the ISO-2022 escape-sequence state machines (high nibble). */
#define ISO_NOSTATE 0x00
#define ISO_ESC 0x10
#define ISO_CS94 0x20
#define ISO_MBCS 0x40
#define ISO_MBYTE1 0x80
/* Split a combined status word: low nibble = verdict, high nibble = state. */
#define CODE_STATE(c) ((c) & 0x0f)
#define EUC_STATE(c) ((c) & 0xf0)
#define SJIS_STATE(c) ((c) & 0xf0)
#define ISO_STATE(c) ((c) & 0xf0)
/*
 * Character set currently selected while reading or writing ISO-2022 text.
 * CSET_UNKNOWN is not referenced anywhere else in this file.
 */
#define CSET_ASCII 0
#define CSET_X0208 1
#define CSET_X0201K 2
#define CSET_UNKNOWN 3
/*
 * Escape-sequence pairs for the ISO-2022-JP output variants.  In each pair
 * the "SI" string is written when output switches to the two-byte character
 * set and the "SO" string when it switches back to the single-byte set (see
 * _cConvEE()).  The pairs are attached to CODE_JIS_* keys in the FromEJ and
 * ToEJ tables.  "\033" is the ESC character.
 */
#define JSIcode "\033$@"
#define JSOcode "\033(H"
#define J2SIcode "\033$@"
#define J2SOcode "\033(J"
#define NSIcode "\033$B"
#define NSOcode "\033(J"
#define N2SIcode "\033$B"
#define N2SOcode "\033(B"
#define N3SIcode "\033$@"
#define N3SOcode "\033(B"
/* USIcode/USOcode are not referenced anywhere else in this file. */
#define USIcode "\033$"
#define USOcode "\033+"
/*
 * Shift sequences for the output code currently in use: written by
 * conv_str() when the target code changes, read by _cConvEE().
 */
static char *SIcode, *SOcode;
/* Forward declarations of the converters referenced by the dispatch tables. */
static Str cConvEE(Str is);
static Str cConvEJ(Str is);
static Str cConvES(Str is);
static Str cConvSE(Str is);
static Str cConvJE(Str is);
char checkShiftCode(Str buf, uchar);
/*
 * Half-width katakana (JIS X 0201-Kana) to full-width (JIS X 0208) mapping.
 *
 * The table has 64 entries.  Callers index it with the kana byte minus the
 * base of its range: (byte - 0xa0) for 8-bit input in 0xa0..0xdf, or
 * (byte - 0x20) for 7-bit input in 0x21..0x5f.  Each entry is a two-character
 * string holding the first and second byte of the replacement character in
 * 7-bit form; callers either OR 0x80 into both bytes to obtain EUC-JP or pass
 * them to put_sjis().
 *
 * The entries are string literals and are never written to, so both the
 * pointers and the pointed-to characters are const.
 */
static const char *const han2zen_tab[] = {
    "!!", "!#", "!V", "!W", "!\"", "!&", "%r", "%!",
    "%#", "%%", "%'", "%)", "%c", "%e", "%g", "%C",
    "!<", "%\"", "%$", "%&", "%(", "%*", "%+", "%-",
    "%/", "%1", "%3", "%5", "%7", "%9", "%;", "%=",
    "%?", "%A", "%D", "%F", "%H", "%J", "%K", "%L",
    "%M", "%N", "%O", "%R", "%U", "%X", "%[", "%^",
    "%_", "%`", "%a", "%b", "%d", "%f", "%h", "%i",
    "%j", "%k", "%l", "%m", "%o", "%s", "!+", "!,",
};
/*
 * One row of a converter dispatch table (see FromEJ and ToEJ).
 */
typedef struct _ConvRoutine {
    /* Code identifier this row applies to; '\0' marks the end of a table. */
    char key;
    /* Converter to call for this code. */
    Str (*routine) ();
    /* Escape sequence that selects the two-byte set ("" if not applicable). */
    char *ShiftIn;
    /* Escape sequence that returns to the one-byte set ("" if not applicable). */
    char *ShiftOut;
} ConvRoutine;
/*
 * Converters from EUC-JP to each supported output code.
 *
 * conv_str() searches this table for the target code and, on a match,
 * installs the row's ShiftIn/ShiftOut strings into SIcode/SOcode before
 * calling the routine.  GetSICode() and GetSOCode() also read the shift
 * strings from this table.  The row whose key is '\0' terminates the table.
 */
static ConvRoutine FromEJ[] = {
{CODE_JIS_J, cConvEJ, JSIcode, JSOcode},
{CODE_JIS_N, cConvEJ, NSIcode, NSOcode},
{CODE_JIS_n, cConvEJ, N2SIcode, N2SOcode},
{CODE_JIS_m, cConvEJ, N3SIcode, N3SOcode},
{CODE_JIS_j, cConvEJ, J2SIcode, J2SOcode},
{CODE_SJIS, cConvES, "", ""},
{CODE_EUC, cConvEE, "", ""},
{'\0', NULL, NULL, NULL} /* end-of-table sentinel */
};
/*
 * Converters from each supported input code to EUC-JP.
 *
 * conv_str() searches this table for the source code and calls the matching
 * routine.  Only the key and routine members of this table are read in this
 * file; the shift strings mirror those in FromEJ.  The row whose key is '\0'
 * terminates the table.
 */
static ConvRoutine ToEJ[] = {
{CODE_JIS_J, cConvJE, JSIcode, JSOcode},
{CODE_JIS_N, cConvJE, NSIcode, NSOcode},
{CODE_JIS_n, cConvJE, N2SIcode, N2SOcode},
{CODE_JIS_m, cConvJE, N3SIcode, N3SOcode},
{CODE_JIS_j, cConvJE, J2SIcode, J2SOcode},
{CODE_SJIS, cConvSE, "", ""},
{CODE_EUC, cConvEE, "", ""},
{'\0', NULL, NULL, NULL} /* end-of-table sentinel */
};
char *
GetSICode(char key)
{
int i;
for (i = 0; FromEJ[i].key != '\0'; i++)
if (FromEJ[i].key == key)
return FromEJ[i].ShiftIn;
return "";
}
char *
GetSOCode(char key)
{
int i;
for (i = 0; FromEJ[i].key != '\0'; i++)
if (FromEJ[i].key == key)
return FromEJ[i].ShiftOut;
return "";
}
/*
 * Report an unsupported conversion code on stderr and call w3m_exit(1).
 *
 * s: the code character that has no entry in the dispatch tables.
 *
 * The code is printed both as a character and as a two-digit hex value.
 * The hex argument goes through unsigned char first: plain char may be
 * signed, and a promoted negative value would otherwise print as a
 * sign-extended number such as 0xffffffe5 instead of 0xe5.
 */
static void
n_impr(char s)
{
    fprintf(stderr,
            "conv: option %c(0x%02x) is not implemented yet... sorry\n",
            s, (unsigned int) (unsigned char) s);
    w3m_exit(1);
}
/*
 * Convert `is` from code `fc` to code `tc`.
 *
 * The conversion runs in up to two stages with EUC-JP as the pivot:
 *   1. fc -> EUC-JP, using the routine found in ToEJ (skipped when fc is
 *      CODE_INNER_EUC);
 *   2. EUC-JP -> tc, using the routine found in FromEJ (skipped when tc is
 *      CODE_INNER_EUC or CODE_EUC).
 *
 * The input is returned unchanged when fc equals tc or either side is
 * CODE_ASCII.  The routine chosen for each stage is cached in static
 * variables and looked up again only when the requested code differs from
 * the previous call; selecting a new target code also updates the
 * file-level SIcode/SOcode shift sequences.
 *
 * A code with no table entry is reported through n_impr(); NULL is returned
 * should that call come back.
 */
Str
conv_str(Str is, char fc, char tc)
{
    static char from_code = '\0';
    static char to_code = '\0';
    static Str (*conv_from) ();
    static Str (*conv_to) ();
    ConvRoutine *row;
    Str pivot;

    if (fc == tc || fc == CODE_ASCII || tc == CODE_ASCII)
        return is;

    /* Stage 1: bring the input to EUC-JP. */
    if (fc == CODE_INNER_EUC) {
        pivot = is;
    }
    else {
        if (from_code != fc) {
            for (row = ToEJ; row->key != '\0'; row++) {
                if (row->key == fc)
                    break;
            }
            if (row->key == '\0') {
                n_impr(fc);
                return NULL;
            }
            from_code = fc;
            conv_from = row->routine;
        }
        pivot = conv_from(is);
    }

    /* Stage 2: take EUC-JP to the requested output code. */
    if (tc == CODE_INNER_EUC || tc == CODE_EUC)
        return pivot;

    if (to_code != tc) {
        for (row = FromEJ; row->key != '\0'; row++) {
            if (row->key == tc)
                break;
        }
        if (row->key == '\0') {
            n_impr(tc);
            return NULL;
        }
        SIcode = row->ShiftIn;
        SOcode = row->ShiftOut;
        to_code = tc;
        conv_to = row->routine;
    }
    return conv_to(pivot);
}
/*
 * Convenience wrapper around conv_str() for C strings: wraps `is` in a Str
 * via Strnew_charp() and converts it from code `fc` to code `tc`.
 */
Str
conv(char *is, char fc, char tc)
{
    Str wrapped = Strnew_charp(is);

    return conv_str(wrapped, fc, tc);
}
/*
 * Decode the second (lower) byte of a Shift-JIS two-byte character.
 *
 * ptr: points at the second byte.
 * ub:  in/out; holds the partially decoded first byte.  It is always
 *      doubled, and decremented by one when the second byte is below 0x9f.
 *
 * Returns the decoded column value: (byte - 0x9e) for bytes at or above
 * 0x9f, otherwise (byte - 0x3f) after closing the gap above 0x7e.
 */
static uchar
getSLb(uchar * ptr, uchar * ub)
{
    uchar trail = *ptr;

    *ub <<= 1;
    if (trail >= 0x9f)
        return trail - 0x9e;

    *ub -= 1;
    if (trail > 0x7e)
        trail--;
    return trail - 0x3f;
}
/*
 * Convert Shift-JIS text to EUC-JP.
 *
 * Returns a new Str; `is` is only read.  Bytes are handled as follows:
 *   - below 0x80: copied through;
 *   - 0x81..0x9f or 0xe0..0xef: first byte of a two-byte character, held
 *     until the next byte arrives (0xe0..0xfc would be needed for
 *     JIS X 0213, which is not enabled);
 *   - 0xa0..0xdf: half-width kana, replaced by the full-width character
 *     from han2zen_tab;
 *   - any other first byte: dropped.
 * After a first byte, a second byte in 0x40..0x7e or 0x80..0xfc completes
 * the character.  Any other second byte is copied if it is below 0x80 and
 * dropped otherwise, and the held first byte is discarded.
 */
static Str
cConvSE(Str is)
{
    uchar *cur, *limit;
    uchar c, hi, lo;
    const char *zen;
    int state = SJIS_NOSTATE;
    Str out = Strnew_size(is->length);

    limit = (uchar *) is->ptr + is->length;
    for (cur = (uchar *) is->ptr; cur < limit; cur++) {
        c = *cur;

        if (state == SJIS_NOSTATE) {
            if (c < 0x80) {
                /* ASCII */
                Strcat_char(out, (char) c);
            }
            else if (c >= 0x81 && c <= 0x9f) {
                /* JIS X 0208, 0213 */
                hi = c & 0x7f;
                state = SJIS_SHIFT_L;
            }
            else if (c >= 0xe0 && c <= 0xef) {
                /* JIS X 0208 */
                hi = (c & 0x7f) - 0x40;
                state = SJIS_SHIFT_H;
            }
            else if (c >= 0xa0 && c <= 0xdf) {
                /* JIS X 0201-Kana */
                zen = han2zen_tab[c - 0xa0];
                Strcat_char(out, (char) (zen[0] | 0x80));
                Strcat_char(out, (char) (zen[1] | 0x80));
            }
            continue;
        }

        /* SJIS_SHIFT_L / SJIS_SHIFT_H: expecting the second byte. */
        if ((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xfc)) {
            /* JIS X 0208, 0213 */
            lo = getSLb(cur, &hi);
            hi += 0x20;
            lo += 0x20;
            Strcat_char(out, (char) (hi | 0x80));
            Strcat_char(out, (char) (lo | 0x80));
        }
        else if (c < 0x80) {
            /* broken sequence: keep the 7-bit byte */
            Strcat_char(out, (char) c);
        }
        state = SJIS_NOSTATE;
    }
    return out;
}
/*
 * Convert ISO-2022-JP text to EUC-JP.
 *
 * Reads `is` one byte at a time (the input is only read) and returns a new
 * Str.  `cset` tracks the character set selected by the most recent escape
 * sequence and `state` tracks progress through an escape sequence or a
 * two-byte character:
 *
 *   ISO_NOSTATE           ordinary data
 *   ISO_MBYTE1            first byte of a JIS X 0208 character is held in ub
 *   ISO_ESC               ESC seen
 *   ISO_CS94              ESC ( seen
 *   ISO_MBCS              ESC $ seen
 *   ISO_MBCS | ISO_CS94   ESC $ ( seen
 *
 * Escape sequences that are not recognised are copied to the output as-is.
 * Half-width kana are replaced by full-width characters via han2zen_tab.
 */
static Str
cConvJE(Str is)
{ /* Convert ISO-2022-JP to EUC-JP */
uchar *p, ub = 0;
char cset = CSET_ASCII;
int state = ISO_NOSTATE;
Str os = Strnew_size(is->length);
uchar *endp = (uchar *) & is->ptr[is->length];
for (p = (uchar *) is->ptr; p < endp; p++) {
switch (state) {
case ISO_NOSTATE:
if (*p == ESC_CODE) /* ESC sequence */
state = ISO_ESC;
/* Single-byte set active, or a control/space byte: copy through. */
else if (cset == CSET_ASCII || *p < 0x21)
Strcat_char(os, (char)(*p));
else if (cset == CSET_X0208 && *p <= 0x7e) {
/* JIS X 0208 */
/* Hold the first byte until the second one arrives. */
ub = *p;
state = ISO_MBYTE1;
}
else if (cset == CSET_X0201K && *p <= 0x5f) {
/* JIS X 0201-Kana */
/* *p is in 0x21..0x5f here, so the table index is 1..63. */
Strcat_char(os, (char)(han2zen_tab[*p - 0x20][0] | 0x80));
Strcat_char(os, (char)(han2zen_tab[*p - 0x20][1] | 0x80));
}
/* Any byte matching none of the branches above is dropped. */
break;
case ISO_MBYTE1:
/* An ESC here abandons the held first byte. */
if (*p == ESC_CODE) /* ESC sequence */
state = ISO_ESC;
else if (0x21 <= *p && *p <= 0x7e) { /* JIS X 0208 */
Strcat_char(os, (char)(ub | 0x80));
Strcat_char(os, (char)(*p | 0x80));
state = ISO_NOSTATE;
}
else {
/* Invalid second byte: copy it and discard the held first byte. */
Strcat_char(os, (char)(*p));
state = ISO_NOSTATE;
}
break;
case ISO_ESC:
if (*p == '(') /* ESC ( F */
state = ISO_CS94;
else if (*p == '$') /* ESC $ F, ESC $ ( F */
state = ISO_MBCS;
else {
/* Not a designation sequence: emit ESC and the byte unchanged. */
Strcat_char(os, ESC_CODE);
Strcat_char(os, (char)(*p));
state = ISO_NOSTATE;
}
break;
case ISO_CS94:
/* Final bytes B, J and H all select the single-byte ASCII handling. */
if (*p == 'B' || *p == 'J' || *p == 'H')
cset = CSET_ASCII;
else if (*p == 'I')
cset = CSET_X0201K;
else {
/* Unknown final byte: reproduce the whole sequence. */
Strcat_char(os, ESC_CODE);
Strcat_char(os, '(');
Strcat_char(os, (char)(*p));
}
state = ISO_NOSTATE;
break;
case ISO_MBCS:
if (*p == '(') { /* ESC $ ( F */
state = ISO_MBCS | ISO_CS94;
break;
}
/* FALLTHROUGH: for ESC $ F the final byte is handled below. */
case ISO_MBCS | ISO_CS94:
if (*p == 'B' || *p == '@')
cset = CSET_X0208;
else {
/* Unknown final byte: reproduce the sequence, including the
 * '(' when it was part of the input. */
Strcat_char(os, ESC_CODE);
Strcat_char(os, '$');
if (state == (ISO_MBCS | ISO_CS94))
Strcat_char(os, '(');
Strcat_char(os, (char)(*p));
}
state = ISO_NOSTATE;
break;
}
}
return os;
}
/*
 * Shared worker: re-encode EUC-JP input either as EUC-JP (is_euc non-zero)
 * or as ISO-2022-JP (is_euc zero).
 *
 * The output never contains JIS X 0201-Kana or three-byte SS3 characters:
 * half-width kana (SS2 + byte) are replaced by the full-width character
 * from han2zen_tab, and the two bytes following an SS3 byte are skipped.
 *
 * When producing ISO-2022-JP, the file-level SIcode string is written
 * before the first two-byte character after single-byte text, SOcode is
 * written before returning to single-byte text, and SOcode is appended at
 * the end if the two-byte set is still selected.  Two-byte characters are
 * written with the high bit set for EUC-JP and cleared for ISO-2022-JP.
 *
 * Returns a new Str; `is` is only read.
 */
static Str
_cConvEE(Str is, char is_euc)
{
    uchar *cur, *limit;
    uchar c, lead = 0, hibit = 0;
    const char *zen;
    int state = EUC_NOSTATE;
    char cset = CSET_ASCII;
    Str out;

    limit = (uchar *) is->ptr + is->length;
    if (is_euc) {
        out = Strnew_size(is->length);
        hibit = 0x80;
    }
    else {
        /* leave room for the escape sequences */
        out = Strnew_size(is->length * 3 / 2);
    }

    for (cur = (uchar *) is->ptr; cur < limit; cur++) {
        c = *cur;
        switch (state) {
        case EUC_SS3:
            /* first byte after SS3: skipped */
            state = (EUC_SS3 | EUC_MBYTE1);
            break;

        case EUC_SS3 | EUC_MBYTE1:
            /* second byte after SS3: skipped */
            state = EUC_NOSTATE;
            break;

        case EUC_SS2:
            if (c >= 0xa0 && c <= 0xdf) {
                /* JIS X 0201-Kana */
                if (!is_euc && cset != CSET_X0208) {
                    Strcat_charp(out, SIcode);
                    cset = CSET_X0208;
                }
                zen = han2zen_tab[c - 0xa0];
                Strcat_char(out, (char) (zen[0] | hibit));
                Strcat_char(out, (char) (zen[1] | hibit));
            }
            state = EUC_NOSTATE;
            break;

        case EUC_MBYTE1:
            if (c >= 0xa1 && c <= 0xfe) {
                /* JIS X 0208, 0213-1 */
                if (!is_euc && cset != CSET_X0208) {
                    Strcat_charp(out, SIcode);
                    cset = CSET_X0208;
                }
                Strcat_char(out, (char) ((lead & 0x7f) | hibit));
                Strcat_char(out, (char) ((c & 0x7f) | hibit));
            }
            else if (c < 0x80) {
                /* broken sequence: keep the 7-bit byte */
                if (!is_euc && cset != CSET_ASCII) {
                    Strcat_charp(out, SOcode);
                    cset = CSET_ASCII;
                }
                Strcat_char(out, (char) c);
            }
            state = EUC_NOSTATE;
            break;

        case EUC_NOSTATE:
            if (c < 0x80) {
                /* ASCII */
                if (!is_euc && cset != CSET_ASCII) {
                    Strcat_charp(out, SOcode);
                    cset = CSET_ASCII;
                }
                Strcat_char(out, (char) c);
            }
            else if (c >= 0xa1 && c <= 0xfe) {
                /* JIS X 0208, 0213-1: hold the first byte */
                lead = c;
                state = EUC_MBYTE1;
            }
            else if (c == EUC_SS2_CODE) {
                /* SS2 + JIS X 0201-Kana */
                state = EUC_SS2;
            }
            else if (c == EUC_SS3_CODE) {
                /* SS3 + JIS X 0212, 0213-2 */
                state = EUC_SS3;
            }
            break;
        }
    }

    if (!is_euc && cset != CSET_ASCII)
        Strcat_charp(out, SOcode);
    return out;
}
/*
 * EUC-JP to EUC-JP: runs the input through _cConvEE() with EUC output
 * selected.
 */
static Str
cConvEE(Str is)
{
    Str out = _cConvEE(is, TRUE);

    return out;
}
/*
 * EUC-JP to ISO-2022-JP: runs the input through _cConvEE() with EUC output
 * deselected, so the current SIcode/SOcode shift sequences are used.
 */
static Str
cConvEJ(Str is)
{
    Str out = _cConvEE(is, FALSE);

    return out;
}
/*
 * Append one two-byte character to `os` in Shift-JIS form.
 *
 * ub, lb: first and second byte of the character in 7-bit form, as passed
 *         by cConvES() (high bit already cleared).
 *
 * Each pair of rows shares one Shift-JIS first byte; characters from the
 * even row of a pair are placed 94 positions further along in the second
 * byte.  First bytes above 0x9f are moved up by 0x40 and second bytes
 * above 0x7e are moved up by one.
 */
void
put_sjis(Str os, uchar ub, uchar lb)
{
    uchar row = ub;
    uchar col = lb;

    row -= 0x20;
    col -= 0x20;

    if (!(row & 1))
        col += 94;
    col += 0x3f;
    if (col > 0x7e)
        col++;

    row = ((row - 1) >> 1) + 0x81;
    if (row > 0x9f)
        row += 0x40;

    Strcat_char(os, (char) (row));
    Strcat_char(os, (char) (col));
}
/*
 * Convert EUC-JP text to Shift-JIS.
 *
 * Returns a new Str; `is` is only read.  Two-byte characters are re-encoded
 * through put_sjis().  Half-width kana (SS2 + byte) are first replaced by
 * the full-width character from han2zen_tab, and the two bytes following
 * an SS3 byte are skipped.  After a first byte, an invalid second byte is
 * copied if it is below 0x80 and dropped otherwise.
 */
static Str
cConvES(Str is)
{
    uchar *cur, *limit;
    uchar c, lead = 0;
    const char *zen;
    int state = EUC_NOSTATE;
    Str out = Strnew_size(is->length);

    limit = (uchar *) is->ptr + is->length;
    for (cur = (uchar *) is->ptr; cur < limit; cur++) {
        c = *cur;
        switch (state) {
        case EUC_SS3:
            /* first byte after SS3: skipped */
            state = (EUC_SS3 | EUC_MBYTE1);
            break;

        case EUC_SS3 | EUC_MBYTE1:
            /* second byte after SS3: skipped */
            state = EUC_NOSTATE;
            break;

        case EUC_SS2:
            if (c >= 0xa0 && c <= 0xdf) {
                /* JIS X 0201-Kana */
                zen = han2zen_tab[c - 0xa0];
                put_sjis(out, zen[0], zen[1]);
            }
            state = EUC_NOSTATE;
            break;

        case EUC_MBYTE1:
            if (c >= 0xa1 && c <= 0xfe) {
                /* JIS X 0208, 0213-1 */
                put_sjis(out, lead & 0x7f, c & 0x7f);
            }
            else if (c < 0x80) {
                /* broken sequence: keep the 7-bit byte */
                Strcat_char(out, (char) c);
            }
            state = EUC_NOSTATE;
            break;

        case EUC_NOSTATE:
            if (c < 0x80) {
                /* ASCII */
                Strcat_char(out, (char) c);
            }
            else if (c >= 0xa1 && c <= 0xfe) {
                /* JIS X 0208, 0213-1: hold the first byte */
                lead = c;
                state = EUC_MBYTE1;
            }
            else if (c == EUC_SS2_CODE) {
                /* SS2 + JIS X 0201-Kana */
                state = EUC_SS2;
            }
            else if (c == EUC_SS3_CODE) {
                /* SS3 + JIS X 0212, 0213-2 */
                state = EUC_SS3;
            }
            break;
        }
    }
    return out;
}
/*
 * Unused lookup tables, kept for reference only (not compiled):
 *
 * static ushort sjis_shift[8] = {
 *     0x7fff, 0xffff, 0x0, 0x0, 0x0, 0x0, 0xffff, 0x0
 * };
 * static ushort sjis_second[16] = {
 *     0x0, 0x0, 0x0, 0x0, 0xffff, 0xffff, 0xffff, 0xfffe,
 *     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfff8
 * };
 */
/*
 * Guess the encoding of `buf`.
 *
 * buf:  text to examine (only read).
 * hint: preferred answer when the data is ambiguous.  If hint is
 *       CODE_INNER_EUC no detection is done and '\0' is returned.
 *
 * Three detectors are run over the bytes side by side:
 *   - iso:  follows ISO-2022 escape sequences, recording the final byte of
 *           the two-byte designation in `si` and of the one-byte
 *           designation in `so`; any byte with the high bit set while it is
 *           still scanning marks it CODE_ERROR.
 *   - euc:  EUC-JP byte-sequence validator.
 *   - sjis: Shift-JIS byte-sequence validator.
 * Each of euc/sjis holds a verdict in the low nibble (CODE_NORMAL, CODE_OK
 * once a multi-byte lead byte was seen, CODE_BROKEN after one bad trail
 * byte) and a parser state in the high nibble; CODE_ERROR replaces the
 * whole word.  Scanning stops at the end of the buffer or as soon as euc
 * or sjis reaches CODE_ERROR.
 *
 * Returns:
 *   - '\0' if no high-bit byte invalidated the ISO detector and no
 *     designation sequence was seen (nothing to convert);
 *   - one of the CODE_JIS_* values chosen from (si, so) if the ISO detector
 *     is still valid;
 *   - otherwise CODE_EUC or CODE_SJIS, preferring `hint` when that detector
 *     did not fail, then a detector that reached CODE_OK, then one still at
 *     CODE_NORMAL, and finally CODE_EUC.
 */
char
checkShiftCode(Str buf, uchar hint)
{
uchar *p, si = '\0', so = '\0';
/* sjis_kana is assigned below but never read in this function. */
int euc = (CODE_NORMAL | EUC_NOSTATE),
sjis = (CODE_NORMAL | SJIS_NOSTATE), sjis_kana = CODE_NORMAL,
iso = (CODE_NORMAL | ISO_NOSTATE), iso_kana = CODE_NORMAL;
uchar *endp = (uchar *) & buf->ptr[buf->length];
if (hint == CODE_INNER_EUC)
return '\0';
p = (uchar *) buf->ptr;
/*
 * NOTE(review): the end-of-buffer test sits at the bottom of the loop, so
 * the byte at buf->ptr[buf->length] is examined too.  Presumably Str keeps
 * a NUL terminator there - confirm.
 */
while (1) {
/* ISO-2022 escape tracking; stops once both si and so are known. */
if (iso != CODE_ERROR && (si == '\0' || so == '\0')) {
switch (ISO_STATE(iso)) {
case ISO_NOSTATE:
if (*p == ESC_CODE) /* ESC sequence */
iso = (CODE_STATE(iso) | ISO_ESC);
break;
case ISO_ESC:
if (*p == '(') /* ESC ( F */
iso = (CODE_STATE(iso) | ISO_CS94);
else if (*p == '$') /* ESC $ F, ESC $ ( F */
iso = (CODE_STATE(iso) | ISO_MBCS);
else
iso = (CODE_STATE(iso) | ISO_NOSTATE);
break;
case ISO_CS94:
/* Remember which one-byte designation the text uses. */
if (*p == 'B' || *p == 'J' || *p == 'H')
so = *p;
else if (*p == 'I')
iso_kana = CODE_OK;
iso = (CODE_STATE(iso) | ISO_NOSTATE);
break;
case ISO_MBCS:
if (*p == '(') { /* ESC $ ( F */
iso = (CODE_STATE(iso) | ISO_MBCS | ISO_CS94);
break;
}
/* FALLTHROUGH: for ESC $ F the final byte is handled below. */
case ISO_MBCS | ISO_CS94:
/* Remember which two-byte designation the text uses. */
if (*p == 'B' || *p == '@')
si = *p;
iso = (CODE_STATE(iso) | ISO_NOSTATE);
break;
}
/* Any 8-bit byte rules out the ISO detector. */
if (*p & 0x80)
iso = CODE_ERROR;
}
/* EUC-JP validation. */
if (euc != CODE_ERROR) {
switch (EUC_STATE(euc)) {
case EUC_NOSTATE:
if (!(*p & 0x80)) /* ASCII */
;
else if (0xa1 <= *p && *p <= 0xfe) /* JIS X 0208, 0213-1 */
euc = (CODE_STATE(euc) | EUC_MBYTE1);
else if (*p == EUC_SS2_CODE) /* SS2 + JIS X 0201-Kana */
euc = (CODE_STATE(euc) | EUC_SS2);
else if (*p == EUC_SS3_CODE) /* SS3 + JIS X 0212, 0213-2 */
euc = (CODE_STATE(euc) | EUC_SS3);
else
euc = CODE_ERROR;
break;
case EUC_MBYTE1:
/* A two-byte lead was seen: upgrade a NORMAL verdict to OK. */
if (CODE_STATE(euc) == CODE_NORMAL)
euc = CODE_OK;
/* FALLTHROUGH: the trail-byte check is shared with the SS3 case. */
case EUC_SS3 | EUC_MBYTE1:
if (0xa1 <= *p && *p <= 0xfe) /* JIS X 0208, 0213-1 */
euc = (CODE_STATE(euc) | EUC_NOSTATE);
/* One bad trail byte is tolerated (BROKEN); a second one is an error. */
else if (euc & CODE_BROKEN)
euc = CODE_ERROR;
else
euc = (CODE_BROKEN | EUC_NOSTATE);
break;
case EUC_SS2:
if (0xa0 <= *p && *p <= 0xdf) /* JIS X 0201-Kana */
euc = (CODE_STATE(euc) | EUC_NOSTATE);
else
euc = CODE_ERROR;
break;
case EUC_SS3:
if (0xa1 <= *p && *p <= 0xfe) /* JIS X 0212, 0213-2 */
euc = (CODE_STATE(euc) | EUC_SS3 | EUC_MBYTE1);
else
euc = CODE_ERROR;
break;
}
}
/* Shift-JIS validation. */
if (sjis != CODE_ERROR) {
switch (SJIS_STATE(sjis)) {
case SJIS_NOSTATE:
if (!(*p & 0x80)) /* ASCII */
;
else if (0x81 <= *p && *p <= 0x9f)
sjis = (CODE_STATE(sjis) | SJIS_SHIFT_L);
else if (0xe0 <= *p && *p <= 0xef)
/* JIS X 0208 */
/* else if (0xe0 <= *p && *p <= 0xfc) */
/* JIS X 0213 */
sjis = (CODE_STATE(sjis) | SJIS_SHIFT_H);
/* A lone 0xa0 is tolerated but marks the data as BROKEN. */
else if (0xa0 == *p)
sjis = (CODE_BROKEN | SJIS_NOSTATE);
else if (0xa1 <= *p && *p <= 0xdf) /* JIS X 0201-Kana */
sjis_kana = CODE_OK;
else
sjis = CODE_ERROR;
break;
case SJIS_SHIFT_L:
case SJIS_SHIFT_H:
/* A two-byte lead was seen: upgrade a NORMAL verdict to OK. */
if (CODE_STATE(sjis) == CODE_NORMAL)
sjis = CODE_OK;
if ((0x40 <= *p && *p <= 0x7e) || (0x80 <= *p && *p <= 0xfc)) /* JIS X 0208, 0213 */
sjis = (CODE_STATE(sjis) | SJIS_NOSTATE);
/* One bad trail byte is tolerated (BROKEN); a second one is an error. */
else if (sjis & CODE_BROKEN)
sjis = CODE_ERROR;
else
sjis = (CODE_BROKEN | SJIS_NOSTATE);
break;
}
}
/* Stop as soon as either 8-bit detector has failed. */
if (euc == CODE_ERROR || sjis == CODE_ERROR)
break;
if (p == endp)
break;
p++;
}
/* No 8-bit byte invalidated the ISO detector: answer from the escapes seen. */
if (iso != CODE_ERROR) {
/* No designation sequence at all: report "nothing to convert". */
if (si == '\0' && so == '\0' && iso_kana != CODE_OK)
return '\0';
/* Map the (two-byte final, one-byte final) pair to a CODE_JIS_* variant. */
switch (si) {
case '@':
switch (so) {
case 'H':
return CODE_JIS_J;
case 'J':
return CODE_JIS_j;
case 'B':
return CODE_JIS_m;
default:
return CODE_JIS_m;
}
case 'B':
switch (so) {
case 'J':
return CODE_JIS_N;
case 'B':
return CODE_JIS_n;
default:
return CODE_JIS_n;
}
default:
/* No two-byte designation seen: decide from the one-byte final alone. */
switch (so) {
case 'H':
return CODE_JIS_J;
case 'J':
return CODE_JIS_N;
case 'B':
return CODE_JIS_n;
default:
return CODE_JIS_n;
}
}
}
/* 8-bit data: honour the hint if that detector did not fail. */
if (hint == CODE_EUC) {
if (euc != CODE_ERROR)
return CODE_EUC;
}
else if (hint == CODE_SJIS) {
if (sjis != CODE_ERROR)
return CODE_SJIS;
}
/* Otherwise prefer a detector that saw valid multi-byte data, then one
 * that saw nothing wrong, and fall back to EUC. */
if (CODE_STATE(euc) == CODE_OK)
return CODE_EUC;
if (CODE_STATE(sjis) == CODE_OK)
return CODE_SJIS;
if (CODE_STATE(euc) == CODE_NORMAL)
return CODE_EUC;
if (CODE_STATE(sjis) == CODE_NORMAL)
return CODE_SJIS;
return CODE_EUC;
}
#endif /* JP_CHARSET */