1 files changed, 138 insertions, 108 deletions
diff --git a/regex.c b/regex.c
index 66c2f1e..120598a 100644
--- a/regex.c
+++ b/regex.c
@@ -1,4 +1,4 @@
-/* $Id: regex.c,v 1.20 2002/12/24 17:20:48 ukai Exp $ */
+/* $Id: regex.c,v 1.21 2003/09/22 21:02:21 ukai Exp $ */
 /* 
  * regex: Regular expression pattern match library
  * 
@@ -14,6 +14,14 @@
 #include <stdlib.h>
 #include <string.h>
 #include <gc.h>
+#include "config.h"
+#ifdef USE_M17N
+#include "wc.h"
+#include "wtf.h"
+#ifdef USE_UNICODE
+#include "ucs.h"
+#endif
+#endif
 #include "regex.h"
 #include "config.h"
 #include "myctype.h"
@@ -22,10 +30,6 @@
 #define NULL	0
 #endif				/* not NULL */
 
-#if LANG == JA
-#define JP_CHARSET
-#endif
-
 #define RE_ITER_LIMIT   65535
 
 #define RE_MATCHMODE	0x07
@@ -51,26 +55,64 @@ char *lc2c(longchar *, int);
 int verbose;
 #endif				/* REGEX_DEBUG */
 
-#ifndef IS_KANJI1
+#ifdef USE_M17N
+#define get_mclen(c) wtf_len1((wc_uchar *)(c))
+#else
+#define get_mclen(c) 1
+#endif
+
+#ifndef TOLOWER
 #include <ctype.h>
-#define IS_KANJI1(x) ((x)&0x80)
 #define TOLOWER(x) tolower(x)
 #define TOUPPER(x) toupper(x)
 #endif
 
-#ifdef JP_CHARSET
-#define RE_KANJI(p)	(((unsigned char)*(p) << 8) | (unsigned char)*((p)+1))
-#endif
+#define RE_TYPE_END     0
+#define RE_TYPE_CHAR    1
+#define RE_TYPE_WCHAR_T 2
+#define RE_WHICH_RANGE  3
+#define RE_TYPE_SYMBOL  4
 
-#define RE_WHICH_RANGE	0xffff
+static longchar
+set_longchar(char *str)
+{
+    unsigned char *p = (unsigned char *)str;
+    longchar r;
+
+#ifdef USE_M17N
+    if (*p & 0x80) {
+	r.wch = wtf_parse1(&p);
+	if (r.wch.ccs == WC_CCS_SPECIAL || r.wch.ccs == WC_CCS_SPECIAL_W) {
+	    r.type = RE_TYPE_SYMBOL;
+	    return r;
+	}
+#ifdef USE_UNICODE
+	if (WC_CCS_IS_UNICODE(r.wch.ccs)) {
+	    if (WC_CCS_SET(r.wch.ccs) == WC_CCS_UCS_TAG)
+		r.wch.code = wc_ucs_tag_to_ucs(r.wch.code);
+	    r.wch.ccs = WC_CCS_UCS4;
+	}
+	else
+#endif
+	    r.wch.ccs = WC_CCS_SET(r.wch.ccs);
+	r.type = RE_TYPE_WCHAR_T;
+	return r;
+    }
+#endif
+    r.ch = *p;
+    r.type = RE_TYPE_CHAR;
+    return r;
+}
 
 static Regex DefaultRegex;
 #define CompiledRegex DefaultRegex.re
 #define Cstorage DefaultRegex.storage
 
 static int regmatch(regexchar *, char *, char *, int, char **);
-static int regmatch1(regexchar *, longchar);
-static int matchWhich(longchar *, longchar, int);
+static int regmatch1(regexchar *, longchar *);
+static int matchWhich(longchar *, longchar *, int);
+static int match_longchar(longchar *, longchar *, int);
+static int match_range_longchar(longchar *, longchar *, longchar *, int);
 
 /* 
  * regexCompile: compile regular expression
@@ -153,21 +195,15 @@ newRegex0(char **ex, int igncase, Regex *regex, char **msg, int level)
 	    else
 		m = RE_WHICH;
 	    if (*p == '-' || *p == ']')
-		*(st_ptr++) = (unsigned char)*(p++);
+		*(st_ptr++) = set_longchar(p);
 	    while (*p != ']') {
 		if (*p == '\\') {
 		    p++;
-#ifdef JP_CHARSET
-		    if (IS_KANJI1(*p)) {
-			*(st_ptr++) = RE_KANJI(p);
-			p += 2;
-		    }
-		    else
-#endif
-			*(st_ptr++) = (unsigned char)*(p++);
+		    *(st_ptr++) = set_longchar(p);
+		    p += get_mclen(p);
 		}
 		else if (*p == '-' && *(p + 1) != ']') {
-		    *(st_ptr++) = RE_WHICH_RANGE;
+		    (st_ptr++)->type = RE_WHICH_RANGE;
 		    p++;
 		}
 		else if (*p == '\0') {
@@ -175,21 +211,17 @@ newRegex0(char **ex, int igncase, Regex *regex, char **msg, int level)
 			*msg = "Missing ]";
 		    return NULL;
 		}
-#ifdef JP_CHARSET
-		else if (IS_KANJI1(*p)) {
-		    *(st_ptr++) = RE_KANJI(p);
-		    p += 2;
+		else {
+		    *(st_ptr++) = set_longchar(p);
+		    p += get_mclen(p);
 		}
-#endif
-		else
-		    *(st_ptr++) = (unsigned char)*(p++);
 		if (st_ptr >= &regex->storage[STORAGE_MAX]) {
 		    if (msg)
 			*msg = "Regular expression too long";
 		    return NULL;
 		}
 	    }
-	    *(st_ptr++) = '\0';
+	    (st_ptr++)->type = RE_TYPE_END;
 	    re->p.pattern = r;
 	    RE_SET_MODE(re, m);
 	    if (igncase)
@@ -226,14 +258,8 @@ newRegex0(char **ex, int igncase, Regex *regex, char **msg, int level)
 	case '\\':
 	    p++;
 	default:
-#ifdef JP_CHARSET
-	    if (IS_KANJI1(*p)) {
-		*(st_ptr) = RE_KANJI(p);
-		p++;
-	    }
-	    else
-#endif
-		*st_ptr = (unsigned char)*p;
+	    *(st_ptr) = set_longchar(p);
+	    p += get_mclen(p) - 1;
 	    re->p.pattern = st_ptr;
 	    st_ptr++;
 	    RE_SET_MODE(re, RE_NORMAL);
@@ -302,10 +328,7 @@ RegexMatch(Regex *re, char *str, int len, int firstp)
 	    /* matched */
 	    return 1;
 	}
-#ifdef JP_CHARSET
-	if (IS_KANJI1(*p))
-	    p++;
-#endif
+	p += get_mclen(p) - 1;
     }
     return 0;
 }
@@ -471,24 +494,11 @@ regmatch_iter(struct MatchingContext1 *c,
 			}
 			return 0;
 		    }
-#ifdef JP_CHARSET
-		    else if (IS_KANJI1(c->str[c->n_any])) {
-			longchar k;
-			k = RE_KANJI(c->str + c->n_any);
-			if (regmatch1(c->re, k)) {
-			    c->n_any += 2;
-			}
-			else {
-			    return 0;
-			}
-			c->firstp = 0;
-		    }
-#endif
 		    else {
 			longchar k;
-			k = (unsigned char)c->str[c->n_any];
-			if (regmatch1(c->re, k)) {
-			    c->n_any++;
+			k = set_longchar(c->str + c->n_any);
+			if (regmatch1(c->re, &k)) {
+			    c->n_any += get_mclen(c->str + c->n_any);
 			}
 			else {
 			    return 0;
@@ -553,20 +563,11 @@ regmatch_iter(struct MatchingContext1 *c,
 	    }
 	    return 0;
 	default:
-#ifdef JP_CHARSET
-	    if (IS_KANJI1(*c->str)) {
-		longchar k;
-		k = RE_KANJI(c->str);
-		c->str += 2;
-		if (!regmatch1(c->re, k))
-		    return 0;
-	    }
-	    else
-#endif
 	    {
 		longchar k;
-		k = (unsigned char)*(c->str++);
-		if (!regmatch1(c->re, k))
+		k = set_longchar(c->str);
+		c->str += get_mclen(c->str);
+		if (!regmatch1(c->re, &k))
 		    return 0;
 	    }
 	    c->re++;
@@ -613,29 +614,29 @@ regmatch(regexchar * re, char *str, char *end_p, int firstp, char **lastpos)
 
 
 static int
-regmatch1(regexchar * re, longchar c)
+regmatch1(regexchar * re, longchar * c)
 {
+    int ans;
+
+#ifdef USE_M17N
+    if (c->type == RE_TYPE_SYMBOL)
+	return 0;
+#endif
     switch (RE_MODE(re)) {
     case RE_ANY:
 #ifdef REGEX_DEBUG
 	if (verbose)
-	    printf("%c vs any. -> 1\n", c);
+	    printf("%s vs any. -> 1\n", lc2c(c, 1));
 #endif				/* REGEX_DEBUG */
 	return 1;
     case RE_NORMAL:
+	ans = match_longchar(re->p.pattern, c, re->mode & RE_IGNCASE);
 #ifdef REGEX_DEBUG
 	if (verbose)
-	    printf("RE=%c vs %c -> %d\n", *re->p.pattern, c,
-		   *re->p.pattern == c);
-#endif				/* REGEX_DEBUG */
-	if (re->mode & RE_IGNCASE) {
-	    if (*re->p.pattern < 127 && c < 127)
-		return TOLOWER(*re->p.pattern) == TOLOWER(c);
-	    else
-		return *re->p.pattern == c;
-	}
-	else
-	    return (*re->p.pattern == c);
+	    printf("RE=%s vs %s -> %d\n", lc2c(re->p.pattern, 1), lc2c(c, 1),
+		   ans);
+#endif                         /* REGEX_DEBUG */
+	return ans;
     case RE_WHICH:
 	return matchWhich(re->p.pattern, c, re->mode & RE_IGNCASE);
     case RE_EXCEPT:
@@ -645,36 +646,25 @@ regmatch1(regexchar * re, longchar c)
 }
 
 static int
-matchWhich(longchar * pattern, longchar c, int igncase)
+matchWhich(longchar * pattern, longchar * c, int igncase)
 {
     longchar *p = pattern;
     int ans = 0;
 
 #ifdef REGEX_DEBUG
     if (verbose)
-	printf("RE pattern = %s char=%s", lc2c(pattern, 10000), lc2c(&c, 1));
+	printf("RE pattern = %s char=%s", lc2c(pattern, 10000), lc2c(c, 1));
 #endif				/* REGEX_DEBUG */
-    while (*p != '\0') {
-	if (*(p + 1) == RE_WHICH_RANGE && *(p + 2) != '\0') {	/* Char class. */
-	    if (*p <= c && c <= *(p + 2)) {
-		ans = 1;
-		break;
-	    }
-	    else if (igncase && c < 127 &&
-		     ((*p <= TOLOWER(c) && TOLOWER(c) <= *(p + 2)) ||
-		      (*p <= TOUPPER(c) && TOUPPER(c) <= *(p + 2)))) {
+    while (p->type != RE_TYPE_END) {
+	if ((p + 1)->type == RE_WHICH_RANGE && (p + 2)->type != RE_TYPE_END) {
+	    if (match_range_longchar(p, p + 2, c, igncase)) {
 		ans = 1;
 		break;
 	    }
 	    p += 3;
 	}
 	else {
-	    if (*p == c) {
-		ans = 1;
-		break;
-	    }
-	    else if (igncase && c < 127 &&
-		     (*p == TOLOWER(c) || *p == TOUPPER(c))) {
+	    if (match_longchar(p, c, igncase)) {
 		ans = 1;
 		break;
 	    }
@@ -688,23 +678,60 @@ matchWhich(longchar * pattern, longchar c, int igncase)
     return ans;
 }
 
+static int
+match_longchar(longchar * a, longchar * b, int ignore)
+{
+#ifdef USE_M17N
+    if (a->type != b->type)
+	return 0;
+    if (a->type == RE_TYPE_WCHAR_T)
+	return (a->wch.ccs == b->wch.ccs) && (a->wch.code == b->wch.code);
+#endif
+    if (ignore && IS_ALPHA(b->ch))
+	return (a->ch == TOLOWER(b->ch) || a->ch == TOUPPER(b->ch));
+    else
+        return a->ch == b->ch;
+}
+
+static int
+match_range_longchar(longchar * a, longchar * b, longchar * c, int ignore)
+{
+#ifdef USE_M17N
+    if (a->type != b->type || a->type != c->type)
+	return 0;
+    if (a->type == RE_TYPE_WCHAR_T)
+	return ((a->wch.ccs == c->wch.ccs && c->wch.ccs == b->wch.ccs) &&
+		(a->wch.code <= c->wch.code && c->wch.code <= b->wch.code));
+#endif
+    if (ignore && IS_ALPHA(c->ch))
+	return ((a->ch <= TOLOWER(c->ch) && TOLOWER(c->ch) <= b->ch) ||
+		(a->ch <= TOUPPER(c->ch) && TOUPPER(c->ch) <= b->ch));
+    else
+	return (a->ch <= c->ch && c->ch <= b->ch);
+}
+
 #ifdef REGEX_DEBUG
 char *
 lc2c(longchar * x, int len)
 {
     static char y[100];
-    int i = 0;
+    int i = 0, j = 0;
     char *r;
 
-    while (x[i] && i < len) {
-	if (x[i] == RE_WHICH_RANGE)
+    while (x[j].type != RE_TYPE_END && j < len) {
+	if (x[j].type == RE_WHICH_RANGE)
 	    y[i++] = '-';
-	else if (x[i] >= 128) {
-	    y[i++] = ((x[i] >> 8) & 0xff);
-	    y[i++] = (x[i] & 0xff);
+#ifdef USE_M17N
+	else if (x[j].type == RE_TYPE_WCHAR_T) {
+	    char buf[20];
+	    sprintf(buf, "[%x-%x]", x[j].wch.ccs, x[j].wch.code);
+	    strcpy(&y[i], buf);
+	    i += strlen(buf);
 	}
+#endif
 	else
-	    y[i++] = x[i];
+	    y[i++] = x[j].ch;
+	j++;
     }
     y[i] = '\0';
     r = GC_malloc_atomic(i + 1);
@@ -774,6 +801,9 @@ main(int argc, char **argv)
     FILE *f = stdin;
     int i = 1;
 
+#ifdef USE_M17N
+    wtf_init(WC_CES_EUC_JP, WC_CES_EUC_JP);
+#endif
 #ifdef REGEX_DEBUG
     for (i = 1; i < argc; i++) {
 	if (strcmp(argv[i], "-v") == 0)