From 919adb4b57977d5e375dab0fa943b6e81fa145ab Mon Sep 17 00:00:00 2001 From: Ito Hiroyuki Date: Tue, 24 Aug 2010 10:11:51 +0000 Subject: [w3m-dev 04393] [patch] locale-related character management --- ChangeLog | 24 +- libwc/map/mk_ucs_case_map.pl | 48 + libwc/map/mk_ucs_isdigit_map.pl | 56 + libwc/map/ucs_case.map | 2600 +++++++++++++++++++++++++++++++++++++++ libwc/map/ucs_isalpha.map | 469 +++++++ libwc/map/ucs_isdigit.map | 30 + libwc/map/ucs_islower.map | 471 +++++++ libwc/map/ucs_isupper.map | 455 +++++++ libwc/ucs.c | 75 ++ libwc/ucs.h | 8 + main.c | 130 +- regex.c | 39 +- 12 files changed, 4348 insertions(+), 57 deletions(-) create mode 100644 libwc/map/mk_ucs_case_map.pl create mode 100644 libwc/map/mk_ucs_isdigit_map.pl create mode 100644 libwc/map/ucs_case.map create mode 100644 libwc/map/ucs_isalpha.map create mode 100644 libwc/map/ucs_isdigit.map create mode 100644 libwc/map/ucs_islower.map create mode 100644 libwc/map/ucs_isupper.map diff --git a/ChangeLog b/ChangeLog index b20b762..8cbbb6b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,25 @@ +2010-08-24 Karsten Schoelzel + + * [w3m-dev 04393] [patch] locale-related character management + * http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=313365 + * libwc/map/ucs_isalpha.map: added + * libwc/map/ucs_islower.map: added + * libwc/map/ucs_isupper.map: added + * libwc/map/mk_ucs_case_map.pl: added + * libwc/map/ucs_case.map: added + * libwc/map/mk_ucs_isdigit_map.pl: added + * libwc/map/ucs_isdigit.map: added + * libwc/ucs.c (wc_any_to_ucs): check if cc.ccs == WC_CCS_US_ASCII + (wc_is_ucs_alpha, wc_is_ucs_digit, wc_is_ucs_alnum) + (wc_is_ucs_lower, wc_is_ucs_upper, wc_ucs_toupper) + (wc_ucs_tolower, wc_ucs_totitle): added + * regex.c (match_longchar, match_range_longchar): use wc_any_to_ucs(), wc_ucs_tolower(), wc_ucs_toupper() and wc_ucs_totitle() + * main.c (nextChar, prevChar, getChar, is_wordchar): added + (movLW): use prevChar() and is_wordchar() + (movRW): use nextChar() and is_wordchar() + 
(getCurWord): remove 4th parameter + (getCurWord): use prevChar(), nextChar() and is_wordchar() + 2010-08-20 Karsten Schoelzel * [w3m-dev 04401] Re: mailto with options handling @@ -9172,4 +9194,4 @@ a * [w3m-dev 03276] compile error on EWS4800 * release-0-2-1 * import w3m-0.2.1 -$Id: ChangeLog,v 1.1037 2010/08/20 09:47:09 htrb Exp $ +$Id: ChangeLog,v 1.1038 2010/08/24 10:11:51 htrb Exp $ diff --git a/libwc/map/mk_ucs_case_map.pl b/libwc/map/mk_ucs_case_map.pl new file mode 100644 index 0000000..ee48826 --- /dev/null +++ b/libwc/map/mk_ucs_case_map.pl @@ -0,0 +1,48 @@ + +open(MAP, "> ucs_case.map"); +print MAP <) { + chop; + ($name, $col) = split; + + @cp = (); + + open(UCD, "< private/UnicodeData-4.1.0.txt"); + while() { + chop; + @entry = split(';'); + last if $entry[0] =~ m/.{5,}/; + if ($entry[$col] ne '') { + push (@cp, $entry[0]); + $map{$entry[0]} = $entry[$col]; + } + } + close UCD; + + $nocp = @cp; + + print MAP < ucs_isdigit.map"); +print MAP <) { + chop; + ($name, $class) = split; + + @cp = (); + + open(UCD, "< private/UnicodeData-4.1.0.txt"); + while() { + chop; + @entry = split(';'); + last if $entry[0] =~ m/.{5,}/; + if ($entry[2] eq $class) { + push (@cp, $entry[0]); + } + } + close UCD; + + @bs = (); + $last = -1; + $seq = -1; + for my $e (@cp) { + if (++$last != hex $e) { + $seq = $e; + $last = hex $e; + push (@bs, $seq); + } + $end{$seq} = $e; + } + $nobs = @bs; + + print MAP < WC_F_CS94_END) return WC_C_UCS4_ERROR; map = cs94_ucs_map[f - WC_F_ISO_BASE]; @@ -558,6 +565,74 @@ wc_is_ucs_hangul(wc_uint32 ucs) ucs_hangul_map, N_ucs_hangul_map) != NULL); } +wc_bool +wc_is_ucs_alpha(wc_uint32 ucs) +{ + return (ucs <= WC_C_UCS2_END && + wc_map_range_search((wc_uint16)ucs, + ucs_isalpha_map, N_ucs_isalpha_map) != NULL); +} + +wc_bool +wc_is_ucs_digit(wc_uint32 ucs) +{ + return (ucs <= WC_C_UCS2_END && + wc_map_range_search((wc_uint16)ucs, + ucs_isdigit_map, N_ucs_isdigit_map) != NULL); +} + +wc_bool +wc_is_ucs_alnum(wc_uint32 ucs) +{ + return 
(wc_is_ucs_alpha(ucs) || wc_is_ucs_digit(ucs)); +} + +wc_bool +wc_is_ucs_lower(wc_uint32 ucs) +{ + return (ucs <= WC_C_UCS2_END && + wc_map_range_search((wc_uint16)ucs, + ucs_islower_map, N_ucs_islower_map) != NULL); +} + +wc_bool +wc_is_ucs_upper(wc_uint32 ucs) +{ + return (ucs <= WC_C_UCS2_END && + wc_map_range_search((wc_uint16)ucs, + ucs_isupper_map, N_ucs_isupper_map) != NULL); +} + +wc_uint32 +wc_ucs_toupper(wc_uint32 ucs) +{ + wc_map *conv = NULL; + if (ucs <= WC_C_UCS2_END) + conv = wc_map_search((wc_uint16)ucs, + ucs_toupper_map, N_ucs_toupper_map); + return conv ? (wc_uint32)(conv->code2) : ucs; +} + +wc_uint32 +wc_ucs_tolower(wc_uint32 ucs) +{ + wc_map *conv = NULL; + if (ucs <= WC_C_UCS2_END) + conv = wc_map_search((wc_uint16)ucs, + ucs_tolower_map, N_ucs_tolower_map); + return conv ? (wc_uint32)(conv->code2) : ucs; +} + +wc_uint32 +wc_ucs_totitle(wc_uint32 ucs) +{ + wc_map *conv = NULL; + if (ucs <= WC_C_UCS2_END) + conv = wc_map_search((wc_uint16)ucs, + ucs_totitle_map, N_ucs_totitle_map); + return conv ? 
(wc_uint32)(conv->code2) : ucs; +} + wc_uint32 wc_ucs_precompose(wc_uint32 ucs1, wc_uint32 ucs2) { diff --git a/libwc/ucs.h b/libwc/ucs.h index 5a3138f..261351e 100644 --- a/libwc/ucs.h +++ b/libwc/ucs.h @@ -48,6 +48,14 @@ extern wc_bool wc_is_ucs_ambiguous_width(wc_uint32 ucs); extern wc_bool wc_is_ucs_wide(wc_uint32 ucs); extern wc_bool wc_is_ucs_combining(wc_uint32 ucs); extern wc_bool wc_is_ucs_hangul(wc_uint32 ucs); +extern wc_bool wc_is_ucs_alpha(wc_uint32 ucs); +extern wc_bool wc_is_ucs_digit(wc_uint32 ucs); +extern wc_bool wc_is_ucs_alnum(wc_uint32 ucs); +extern wc_bool wc_is_ucs_lower(wc_uint32 ucs); +extern wc_bool wc_is_ucs_upper(wc_uint32 ucs); +extern wc_uint32 wc_ucs_toupper(wc_uint32 ucs); +extern wc_uint32 wc_ucs_tolower(wc_uint32 ucs); +extern wc_uint32 wc_ucs_totitle(wc_uint32 ucs); extern wc_uint32 wc_ucs_precompose(wc_uint32 ucs1, wc_uint32 ucs2); extern wc_uint32 wc_ucs_to_fullwidth(wc_uint32 ucs); extern int wc_ucs_put_tag(char *tag); diff --git a/main.c b/main.c index 1f972cc..b421943 100644 --- a/main.c +++ b/main.c @@ -1,4 +1,4 @@ -/* $Id: main.c,v 1.269 2010/08/20 09:47:09 htrb Exp $ */ +/* $Id: main.c,v 1.270 2010/08/24 10:11:51 htrb Exp $ */ #define MAINPROGRAM #include "fm.h" #include @@ -14,6 +14,13 @@ #include "terms.h" #include "myctype.h" #include "regex.h" +#ifdef USE_M17N +#include "wc.h" +#include "wtf.h" +#ifdef USE_UNICODE +#include "ucs.h" +#endif +#endif #ifdef USE_MOUSE #ifdef USE_GPM #include @@ -85,8 +92,7 @@ static void keyPressEventProc(int c); int show_params_p = 0; void show_params(FILE * fp); -static char *getCurWord(Buffer *buf, int *spos, int *epos, - const char *badchars); +static char *getCurWord(Buffer *buf, int *spos, int *epos); static int display_ok = FALSE; static void do_dump(Buffer *); @@ -2248,7 +2254,32 @@ DEFUN(movR1, MOVE_RIGHT1, * From: Takashi Nishimoto Date: Mon, 14 Jun * 1999 09:29:56 +0900 */ -#define IS_WORD_CHAR(c,p) (IS_ALNUM(c) && CharType(p) == PC_ASCII) +#if defined(USE_M17N) && 
defined(USE_UNICODE) +#define nextChar(s, l) do { (s)++; } while ((s) < (l)->len && (l)->propBuf[s] & PC_WCHAR2) +#define prevChar(s, l) do { (s)--; } while ((s) > 0 && (l)->propBuf[s] & PC_WCHAR2) + +static wc_uint32 +getChar(char *p) +{ + return wc_any_to_ucs(wtf_parse1(&p)); +} + +static int +is_wordchar(wc_uint32 c) +{ + return wc_is_ucs_alnum(c); +} +#else +#define nextChar(s, l) (s)++ +#define prevChar(s, l) (s)-- +#define getChar(p) ((int)*(p)) + +static int +is_wordchar(int c) +{ + return IS_ALNUM(c); +} +#endif static int prev_nonnull_line(Line *line) @@ -2268,8 +2299,7 @@ prev_nonnull_line(Line *line) DEFUN(movLW, PREV_WORD, "Move to previous word") { char *lb; - Lineprop *pb; - Line *pline; + Line *pline, *l; int ppos; int i, n = searchKeyNum(); @@ -2284,12 +2314,14 @@ DEFUN(movLW, PREV_WORD, "Move to previous word") goto end; while (1) { - lb = Currentbuf->currentLine->lineBuf; - pb = Currentbuf->currentLine->propBuf; - while (Currentbuf->pos > 0 && - !IS_WORD_CHAR(lb[Currentbuf->pos - 1], - pb[Currentbuf->pos - 1])) { - Currentbuf->pos--; + l = Currentbuf->currentLine; + lb = l->lineBuf; + while (Currentbuf->pos > 0) { + int tmp = Currentbuf->pos; + prevChar(tmp, l); + if (is_wordchar(getChar(&lb[tmp]))) + break; + Currentbuf->pos = tmp; } if (Currentbuf->pos > 0) break; @@ -2301,12 +2333,14 @@ DEFUN(movLW, PREV_WORD, "Move to previous word") Currentbuf->pos = Currentbuf->currentLine->len; } - lb = Currentbuf->currentLine->lineBuf; - pb = Currentbuf->currentLine->propBuf; - while (Currentbuf->pos > 0 && - IS_WORD_CHAR(lb[Currentbuf->pos - 1], - pb[Currentbuf->pos - 1])) { - Currentbuf->pos--; + l = Currentbuf->currentLine; + lb = l->lineBuf; + while (Currentbuf->pos > 0) { + int tmp = Currentbuf->pos; + prevChar(tmp, l); + if (!is_wordchar(getChar(&lb[tmp]))) + break; + Currentbuf->pos = tmp; } } end: @@ -2333,8 +2367,7 @@ next_nonnull_line(Line *line) DEFUN(movRW, NEXT_WORD, "Move to next word") { char *lb; - Lineprop *pb; - Line *pline; + Line 
*pline, *l; int ppos; int i, n = searchKeyNum(); @@ -2348,18 +2381,17 @@ DEFUN(movRW, NEXT_WORD, "Move to next word") if (next_nonnull_line(Currentbuf->currentLine) < 0) goto end; - lb = Currentbuf->currentLine->lineBuf; - pb = Currentbuf->currentLine->propBuf; - - while (lb[Currentbuf->pos] && - IS_WORD_CHAR(lb[Currentbuf->pos], pb[Currentbuf->pos])) - Currentbuf->pos++; + l = Currentbuf->currentLine; + lb = l->lineBuf; + while (Currentbuf->pos < l->len && + is_wordchar(getChar(&lb[Currentbuf->pos]))) + nextChar(Currentbuf->pos, l); while (1) { - while (lb[Currentbuf->pos] && - !IS_WORD_CHAR(lb[Currentbuf->pos], pb[Currentbuf->pos])) - Currentbuf->pos++; - if (lb[Currentbuf->pos]) + while (Currentbuf->pos < l->len && + !is_wordchar(getChar(&lb[Currentbuf->pos]))) + nextChar(Currentbuf->pos, l); + if (Currentbuf->pos < l->len) break; if (next_nonnull_line(Currentbuf->currentLine->next) < 0) { Currentbuf->currentLine = pline; @@ -2367,8 +2399,8 @@ DEFUN(movRW, NEXT_WORD, "Move to next word") goto end; } Currentbuf->pos = 0; - lb = Currentbuf->currentLine->lineBuf; - pb = Currentbuf->currentLine->propBuf; + l = Currentbuf->currentLine; + lb = l->lineBuf; } } end: @@ -4893,7 +4925,7 @@ DEFUN(chkWORD, MARK_WORD, "Mark current word as anchor") { char *p; int spos, epos; - p = getCurWord(Currentbuf, &spos, &epos, ":\"\'`<>()[]{}&|;*?$"); + p = getCurWord(Currentbuf, &spos, &epos); if (p == NULL) return; reAnchorWord(Currentbuf, Currentbuf->currentLine, spos, epos); @@ -5505,17 +5537,8 @@ DEFUN(wrapToggle, WRAP_TOGGLE, "Toggle wrap search mode") } } -static int -is_wordchar(int c, const char *badchars) -{ - if (badchars) - return !(IS_SPACE(c) || strchr(badchars, c)); - else - return IS_ALPHA(c); -} - static char * -getCurWord(Buffer *buf, int *spos, int *epos, const char *badchars) +getCurWord(Buffer *buf, int *spos, int *epos) { char *p; Line *l = buf->currentLine; @@ -5527,15 +5550,20 @@ getCurWord(Buffer *buf, int *spos, int *epos, const char *badchars) return NULL; p 
= l->lineBuf; e = buf->pos; - while (e > 0 && !is_wordchar(p[e], badchars)) - e--; - if (!is_wordchar(p[e], badchars)) + while (e > 0 && !is_wordchar(getChar(&p[e]))) + prevChar(e, l); + if (!is_wordchar(getChar(&p[e]))) return NULL; b = e; - while (b > 0 && is_wordchar(p[b - 1], badchars)) - b--; - while (e < l->len && is_wordchar(p[e], badchars)) - e++; + while (b > 0) { + int tmp = b; + prevChar(tmp, l); + if (!is_wordchar(getChar(&p[tmp]))) + break; + b = tmp; + } + while (e < l->len && is_wordchar(getChar(&p[e]))) + nextChar(e, l); *spos = b; *epos = e; return &p[b]; @@ -5547,7 +5575,7 @@ GetWord(Buffer *buf) int b, e; char *p; - if ((p = getCurWord(buf, &b, &e, 0)) != NULL) { + if ((p = getCurWord(buf, &b, &e)) != NULL) { return Strnew_charp_n(p, e - b)->ptr; } return NULL; diff --git a/regex.c b/regex.c index 09166fc..5bee4b2 100644 --- a/regex.c +++ b/regex.c @@ -1,4 +1,4 @@ -/* $Id: regex.c,v 1.22 2003/09/24 18:49:00 ukai Exp $ */ +/* $Id: regex.c,v 1.23 2010/08/24 10:11:51 htrb Exp $ */ /* * regex: Regular expression pattern match library * @@ -684,8 +684,18 @@ match_longchar(longchar * a, longchar * b, int ignore) #ifdef USE_M17N if (a->type != b->type) return 0; - if (a->type == RE_TYPE_WCHAR_T) + if (a->type == RE_TYPE_WCHAR_T) { +#ifdef USE_UNICODE + if (ignore) { + wc_uint32 ua = wc_any_to_ucs(a->wch), ub = wc_any_to_ucs(b->wch); + return (ua == ub || + ua == wc_ucs_tolower(ub) || + ua == wc_ucs_toupper(ub) || + ua == wc_ucs_totitle(ub)); + } +#endif return (a->wch.ccs == b->wch.ccs) && (a->wch.code == b->wch.code); + } #endif if (ignore && IS_ALPHA(b->ch)) return (a->ch == TOLOWER(b->ch) || a->ch == TOUPPER(b->ch)); @@ -699,9 +709,28 @@ match_range_longchar(longchar * a, longchar * b, longchar * c, int ignore) #ifdef USE_M17N if (a->type != b->type || a->type != c->type) return 0; - if (a->type == RE_TYPE_WCHAR_T) - return ((a->wch.ccs == c->wch.ccs && c->wch.ccs == b->wch.ccs) && - (a->wch.code <= c->wch.code && c->wch.code <= b->wch.code)); + if 
(a->type == RE_TYPE_WCHAR_T) { + if (a->wch.ccs != c->wch.ccs || c->wch.ccs != b->wch.ccs) + return 0; +#ifdef USE_UNICODE + if (ignore) { + wc_uint32 uc = wc_any_to_ucs(c->wch); + + if (wc_is_ucs_alpha(uc)) { + wc_uint32 ua = wc_any_to_ucs(a->wch); + wc_uint32 ub = wc_any_to_ucs(b->wch); + wc_uint32 upper = wc_ucs_toupper(uc); + wc_uint32 lower = wc_ucs_tolower(uc); + wc_uint32 title = wc_ucs_totitle(uc); + + return ((ua <= upper && upper <= ub) || + (ua <= lower && lower <= ub) || + (ua <= title && title <= ub)); + } + } +#endif + return (a->wch.code <= c->wch.code && c->wch.code <= b->wch.code); + } #endif if (ignore && IS_ALPHA(c->ch)) return ((a->ch <= TOLOWER(c->ch) && TOLOWER(c->ch) <= b->ch) || -- cgit v1.2.3