diff options
author | Fumitoshi UKAI <ukai@debian.or.jp> | 2002-01-10 04:55:06 +0000 |
---|---|---|
committer | Fumitoshi UKAI <ukai@debian.or.jp> | 2002-01-10 04:55:06 +0000 |
commit | 31d84e0083f0ee9ef6fea93ac85e537cf1688ca7 (patch) | |
tree | 54a0d08accf6728d193902418db7a2bfec23859f /regex.c | |
parent | [w3m-dev 02810] (diff) | |
download | w3m-31d84e0083f0ee9ef6fea93ac85e537cf1688ca7.tar.gz w3m-31d84e0083f0ee9ef6fea93ac85e537cf1688ca7.zip |
[w3m-dev 02811] new regexp implementation
From: aito@fw.ipsj.or.jp
Diffstat (limited to '')
-rw-r--r-- | regex.c | 625 |
1 files changed, 495 insertions, 130 deletions
@@ -1,18 +1,60 @@ -/* $Id: regex.c,v 1.6 2001/11/30 10:10:24 ukai Exp $ */ +/* $Id: regex.c,v 1.7 2002/01/10 04:55:07 ukai Exp $ */ /* * regex: Regular expression pattern match library * * by A.ITO, December 1989 + * Revised by A.ITO, January 2002 */ #ifdef REGEX_DEBUG #include <sys/types.h> #include <malloc.h> #endif /* REGEX_DEBUG */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #include <ctype.h> #include <gc.h> -#include "fm.h" #include "regex.h" +#include "config.h" + +#ifndef NULL +#define NULL 0 +#endif /* not NULL */ + +#if LANG == JA +#define JP_CHARSET +#endif + +#define RE_ITER_LIMIT 65535 + +#define RE_MATCHMODE 0x07 +#define RE_NORMAL 0x00 +#define RE_ANY 0x01 +#define RE_WHICH 0x02 +#define RE_EXCEPT 0x03 +#define RE_SUBREGEX 0x04 +#define RE_BEGIN 0x05 +#define RE_END 0x06 +#define RE_ENDMARK 0x07 + +#define RE_OPT 0x08 +#define RE_ANYTIME 0x10 +#define RE_IGNCASE 0x40 + +#define RE_MODE(x) ((x)->mode&RE_MATCHMODE) +#define RE_SET_MODE(x,v) ((x)->mode = (((x)->mode&~RE_MATCHMODE)|((v)&RE_MATCHMODE))) + +#ifdef REGEX_DEBUG +void debugre(regexchar *); +char *lc2c(longchar *, int); +int verbose; +#endif /* REGEX_DEBUG */ + +#ifndef IS_ALPHA +#define IS_ALPHA(x) (!((x)&0x80) && isalpha(x)) +#define IS_KANJI1(x) ((x)&0x80) +#endif #ifdef JP_CHARSET #define RE_KANJI(p) (((unsigned char)*(p) << 8) | (unsigned char)*((p)+1)) @@ -24,13 +66,10 @@ static Regex DefaultRegex; #define CompiledRegex DefaultRegex.re #define Cstorage DefaultRegex.storage -static longchar *st_ptr; - -static int regmatch(regexchar *, char *, int, int, char **); +static int regmatch(regexchar *, char *, int, char **); static int regmatch1(regexchar *, longchar); static int matchWhich(longchar *, longchar); - /* * regexCompile: compile regular expression */ @@ -42,41 +81,66 @@ regexCompile(char *ex, int igncase) return msg; } -Regex * -newRegex(char *ex, int igncase, Regex *regex, char **msg) +static Regex * +newRegex0(char **ex, int igncase, Regex *regex, char **msg, int level) { char *p; longchar *r; - regexchar *re = regex->re - 1; + regexchar *re; int m; + longchar *st_ptr; - if (regex == 0) - regex = (Regex *)GC_malloc_atomic(sizeof(Regex)); + if (regex == NULL) + regex = (Regex *)GC_malloc(sizeof(Regex)); + regex->alt_regex = NULL; + re = regex->re; st_ptr = regex->storage; - for (p = ex; *p != '\0'; p++) { + for (p = *ex; *p != '\0'; p++) { + re->mode = 0; switch (*p) { case '.': + re->p.pattern = NULL; + RE_SET_MODE(re, RE_ANY); re++; - re->pattern = NULL; - re->mode = RE_ANY; break; case '$': + re->p.pattern = NULL; + RE_SET_MODE(re, RE_END); re++; - re->pattern = NULL; - re->mode = RE_END; break; case '^': + re->p.pattern = NULL; + RE_SET_MODE(re, RE_BEGIN); re++; - re->pattern = NULL; - re->mode = RE_BEGIN; break; - case '*': - if (!(re->mode & RE_ANY) && re->pattern == NULL) { + case '+': + if (re == regex->re || + (RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) { if (msg) *msg = "Invalid regular expression"; return NULL; } + *re = *(re - 1); re->mode |= RE_ANYTIME; + re++; + break; + case '*': + if (re == regex->re || + (RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) { + if (msg) + *msg = "Invalid regular expression"; + return NULL; + } + (re - 1)->mode |= RE_ANYTIME; + break; + case '?': + if (re == regex->re || + (RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) { + if (msg) + *msg = "Invalid regular expression"; + return NULL; + } + (re - 1)->mode |= RE_OPT; break; case '[': r = st_ptr; @@ -110,14 +174,40 @@ newRegex(char *ex, int igncase, Regex *regex, char **msg) *(st_ptr++) = (unsigned char)*(p++); } *(st_ptr++) = '\0'; + re->p.pattern = r; + RE_SET_MODE(re, m); + re++; + break; + case '|': + RE_SET_MODE(re, RE_ENDMARK); + re++; + p++; + regex->alt_regex = newRegex0(&p, igncase, NULL, msg, level); + if (regex->alt_regex == NULL) + return NULL; + *ex = p; + return regex; + case '(': + RE_SET_MODE(re, RE_SUBREGEX); + p++; + re->p.sub = newRegex0(&p, igncase, NULL, msg, level + 1); + if (re->p.sub == NULL) + return NULL; re++; - re->pattern = r; - re->mode = m; break; + case ')': + if (level == 0) { + if (msg) + *msg = "Too many ')'"; + return NULL; + } + RE_SET_MODE(re, RE_ENDMARK); + re++; + *ex = p; + return regex; case '\\': p++; default: - re++; #ifdef JP_CHARSET if (IS_KANJI1(*p)) { *(st_ptr) = RE_KANJI(p); @@ -126,26 +216,33 @@ newRegex(char *ex, int igncase, Regex *regex, char **msg) else #endif *st_ptr = (unsigned char)*p; - re->pattern = st_ptr; + re->p.pattern = st_ptr; st_ptr++; - re->mode = RE_NORMAL; + RE_SET_MODE(re, RE_NORMAL); if (igncase) re->mode |= RE_IGNCASE; + re++; } - if (st_ptr >= &Cstorage[STORAGE_MAX] || - re >= &CompiledRegex[REGEX_MAX]) { + if (st_ptr >= ®ex->storage[STORAGE_MAX] || + re >= ®ex->re[REGEX_MAX]) { if (msg) *msg = "Regular expression too long"; return NULL; } } - re++; - re->mode = RE_ENDMARK; + RE_SET_MODE(re, RE_ENDMARK); if (msg) *msg = NULL; + *ex = p; return regex; } +Regex * +newRegex(char *ex, int igncase, Regex *regex, char **msg) +{ + return newRegex0(&ex, igncase, regex, msg, 0); +} + /* * regexMatch: match regular expression */ @@ -159,20 +256,33 @@ int RegexMatch(Regex *re, char *str, int len, int firstp) { char *p, *ep; + char *lpos; + Regex *r; if (str == NULL) return 0; + if (len == 0) + len = strlen(str); re->position = NULL; - ep = str + ((len == 0) ? strlen(str) : len); + ep = str + len; for (p = str; p < ep; p++) { - switch (regmatch - (re->re, p, ep - p, firstp && (p == str), &re->lposition)) { - case 1: - re->position = p; + lpos = NULL; + re->lposition = NULL; + for (r = re; r != NULL; r = r->alt_regex) { + switch (regmatch(r->re, p, firstp && (p == str), &lpos)) { + case 1: /* matched */ + re->position = p; + if (re->lposition == NULL || re->lposition < lpos) + re->lposition = lpos; + break; + case -1: /* error */ + re->position = NULL; + return -1; + } + } + if (re->lposition != NULL) { + /* matched */ return 1; - case -1: - re->position = NULL; - return -1; } #ifdef JP_CHARSET if (IS_KANJI1(*p)) @@ -202,118 +312,306 @@ matchedPosition(char **first, char **last) /* * Intermal routines */ + +struct MatchingContext1 { + int label; + regexchar *re; + char *lastpos; + char *str; + int iter_limit; + int n_any; + int firstp; + char *end_p; + Regex *sub_regex; + struct MatchingContext1 *sub_ctx; + struct MatchingContext2 *ctx2; +}; + +struct MatchingContext2 { + int label; + Regex *regex; + char *lastpos; + struct MatchingContext1 *ctx; + struct MatchingContext2 *ctx2; + char *str; + int n_any; + int firstp; +}; + + +#define YIELD(retval,context,lnum) (context)->label = lnum; return (retval); label##lnum: + +static int regmatch_iter(struct MatchingContext1 *, regexchar *, char *, int); + static int -regmatch(regexchar * re, char *str, int len, int firstp, char **lastpos) +regmatch_sub_anytime(struct MatchingContext2 *c, Regex *regex, + regexchar * pat2, char *str, int iter_limit, int firstp) { - char *p = str, *ep = str + len; - char *lpos, *llpos = NULL; - longchar k; + switch (c->label) { + case 1: + goto label1; + case 2: + goto label2; + case 3: + goto label3; + } + c->ctx = GC_malloc(sizeof(struct MatchingContext1)); + c->ctx2 = GC_malloc(sizeof(struct MatchingContext2)); + c->ctx->label = 0; + c->regex = regex; + c->n_any = 0; + c->str = str; + c->firstp = firstp; + for (;;) { + c->ctx->label = 0; + while (regmatch_iter(c->ctx, c->regex->re, c->str, c->firstp)) { + c->n_any = c->ctx->lastpos - c->str; + if (c->n_any <= 0) + continue; + c->firstp = 0; + if (RE_MODE(pat2) == RE_ENDMARK) { + c->lastpos = c->str + c->n_any; + YIELD(1, c, 1); + } + else if (regmatch(pat2, c->str + c->n_any, + c->firstp, &c->lastpos) == 1) { + YIELD(1, c, 2); + } + if (iter_limit == 1) + continue; + c->ctx2->label = 0; + while (regmatch_sub_anytime(c->ctx2, regex, pat2, + c->str + c->n_any, iter_limit - 1, + c->firstp)) { - *lastpos = NULL; -#ifdef REGEX_DEBUG - debugre(re, str); -#endif /* REGEX_DEBUG */ - while ((re->mode & RE_ENDMARK) == 0) { - if (re->mode & RE_BEGIN) { - if (!firstp) - return 0; - re++; + c->lastpos = c->ctx2->lastpos; + YIELD(1, c, 3); + } } - else if (re->mode & RE_ANYTIME) { - short matched, ok = 0; - for (;;) { - matched = 0; - if (regmatch(re + 1, p, ep - p, firstp, &lpos) == 1) { - llpos = lpos; - matched = 1; - ok = 1; + if (c->regex->alt_regex == NULL) + break; + c->regex = c->regex->alt_regex; + } + return 0; +} + +static int +regmatch_iter(struct MatchingContext1 *c, + regexchar * re, char *str, int firstp) +{ + switch (c->label) { + case 1: + goto label1; + case 2: + goto label2; + case 3: + goto label3; + case 4: + goto label4; + case 5: + goto label5; + case 6: + goto label6; + case 7: + goto label7; + } + if (RE_MODE(re) == RE_ENDMARK) + return 0; + c->re = re; + c->end_p = str + strlen(str); + c->firstp = firstp; + c->str = str; + c->sub_ctx = NULL; + while (RE_MODE(c->re) != RE_ENDMARK) { + if (c->re->mode & (RE_ANYTIME | RE_OPT)) { + if (c->re->mode & RE_ANYTIME) + c->iter_limit = RE_ITER_LIMIT; + else + c->iter_limit = 1; + c->n_any = -1; + while (c->n_any < c->iter_limit) { + if (c->str + c->n_any >= c->end_p) { + return 0; } - if (p >= ep) - break; + if (c->n_any >= 0) { + if (RE_MODE(c->re) == RE_SUBREGEX) { + c->ctx2 = GC_malloc(sizeof(struct MatchingContext2)); + c->ctx2->label = 0; + while (regmatch_sub_anytime(c->ctx2, + c->re->p.sub, + c->re + 1, + c->str + c->n_any, + c->iter_limit, + c->firstp)) { + c->n_any = c->ctx2->lastpos - c->str; + c->lastpos = c->ctx2->lastpos; + YIELD(1, c, 1); + } + return 0; + } #ifdef JP_CHARSET - if (IS_KANJI1(*p)) { - k = RE_KANJI(p); - if (regmatch1(re, k)) { - if (lastpos != NULL) - *lastpos = llpos; - p += 2; + else if (IS_KANJI1(c->str[c->n_any])) { + longchar k; + k = RE_KANJI(c->str + c->n_any); + if (regmatch1(c->re, k)) { + c->n_any += 2; + } + else { + return 0; + } + c->firstp = 0; } - else - break; - } - else #endif - { - k = (unsigned char)*p; - if (regmatch1(re, k)) { - p++; - if (lastpos != NULL) - *lastpos = llpos; + else { + longchar k; + k = (unsigned char)c->str[c->n_any]; + if (regmatch1(c->re, k)) { + c->n_any++; + } + else { + return 0; + } + c->firstp = 0; } - else - break; + } + else + c->n_any++; + if (RE_MODE(c->re + 1) == RE_ENDMARK) { + c->lastpos = c->str + c->n_any; + YIELD(1, c, 2); + } + else if (regmatch(c->re + 1, c->str + c->n_any, + c->firstp, &c->lastpos) == 1) { + YIELD(1, c, 3); } } - if (lastpos != NULL) - *lastpos = llpos; - return ok; - } - else if (re->mode & RE_END) { - if (lastpos != NULL) - *lastpos = p; - return (p >= ep); + return 0; } - else { - int a; + /* regexp other than pat*, pat+ and pat? */ + if (c->str >= c->end_p) + return 0; + switch (RE_MODE(c->re)) { + case RE_BEGIN: + if (!c->firstp) + return 0; + c->re++; + break; + case RE_END: + c->lastpos = c->str; + c->re++; + YIELD((c->str >= c->end_p), c, 4); + break; + case RE_SUBREGEX: + if (c->sub_ctx == NULL) { + c->sub_ctx = GC_malloc(sizeof(struct MatchingContext1)); + } + c->sub_regex = c->re->p.sub; + for (;;) { + c->sub_ctx->label = 0; + while (regmatch_iter(c->sub_ctx, c->sub_regex->re, + c->str, c->firstp)) { + if (c->sub_ctx->lastpos != c->str) + c->firstp = 0; + if (RE_MODE(c->re + 1) == RE_ENDMARK) { + c->lastpos = c->sub_ctx->lastpos; + YIELD(1, c, 5); + } + else if (regmatch(c->re + 1, c->sub_ctx->lastpos, + c->firstp, &c->lastpos) == 1) { + YIELD(1, c, 6); + } + } + if (c->sub_regex->alt_regex == NULL) + break; + c->sub_regex = c->sub_regex->alt_regex; + } + return 0; + default: #ifdef JP_CHARSET - if (IS_KANJI1(*p)) { - k = RE_KANJI(p); - p += 2; - a = regmatch1(re, k); + if (IS_KANJI1(*c->str)) { + longchar k; + k = RE_KANJI(c->str); + c->str += 2; + if (!regmatch1(c->re, k)) + return 0; } else #endif { - k = (unsigned char)*(p++); - a = regmatch1(re, k); + longchar k; + k = (unsigned char)*(c->str++); + if (!regmatch1(c->re, k)) + return 0; } - if (!a) - return 0; - else - re++; + c->re++; + c->firstp = 0; + } + } + c->lastpos = c->str; +#ifdef REGEX_DEBUG + if (verbose) + printf("Succeed: %s %d\n", c->str, c->lastpos - c->str); +#endif + YIELD(1, c, 7); + return 0; +} + +static int +regmatch(regexchar * re, char *str, int firstp, char **lastpos) +{ + struct MatchingContext1 contx; + + *lastpos = NULL; + + contx.label = 0; + while (regmatch_iter(&contx, re, str, firstp)) { +#ifdef REGEX_DEBUG + char *p; + if (verbose) { + printf("regmatch: matched <"); + for (p = str; p < contx.lastpos; p++) + putchar(*p); + printf(">\n"); } +#endif + if (*lastpos == NULL || *lastpos < contx.lastpos) + *lastpos = contx.lastpos; } - if (lastpos != NULL) - *lastpos = p; + if (*lastpos == NULL) + return 0; return 1; } + static int regmatch1(regexchar * re, longchar c) { - switch (re->mode & RE_MATCHMODE) { + switch (RE_MODE(re)) { case RE_ANY: #ifdef REGEX_DEBUG - printf("%c vs any. -> 1\n", c); + if (verbose) + printf("%c vs any. -> 1\n", c); #endif /* REGEX_DEBUG */ return 1; case RE_NORMAL: #ifdef REGEX_DEBUG - printf("RE=%c vs %c -> %d\n", *re->pattern, c, *re->pattern == c); + if (verbose) + printf("RE=%c vs %c -> %d\n", *re->p.pattern, c, + *re->p.pattern == c); #endif /* REGEX_DEBUG */ if (re->mode & RE_IGNCASE) { - if (*re->pattern < 127 && c < 127 && - IS_ALPHA(*re->pattern) && IS_ALPHA(c)) - return tolower(*re->pattern) == tolower(c); + if (*re->p.pattern < 127 && c < 127 && + IS_ALPHA(*re->p.pattern) && IS_ALPHA(c)) + return tolower(*re->p.pattern) == tolower(c); else - return *re->pattern == c; + return *re->p.pattern == c; } else - return (*re->pattern == c); + return (*re->p.pattern == c); case RE_WHICH: - return matchWhich(re->pattern, c); + return matchWhich(re->p.pattern, c); case RE_EXCEPT: - return !matchWhich(re->pattern, c); + return !matchWhich(re->p.pattern, c); } return 0; } @@ -325,7 +623,8 @@ matchWhich(longchar * pattern, longchar c) int ans = 0; #ifdef REGEX_DEBUG - printf("RE pattern = %s char=%c", pattern, c); + if (verbose) + printf("RE pattern = %s char=%s", lc2c(pattern, 10000), lc2c(&c, 1)); #endif /* REGEX_DEBUG */ while (*p != '\0') { if (*(p + 1) == RE_WHICH_RANGE && *(p + 2) != '\0') { /* Char class. */ @@ -344,64 +643,130 @@ matchWhich(longchar * pattern, longchar c) } } #ifdef REGEX_DEBUG - printf(" -> %d\n", ans); + if (verbose) + printf(" -> %d\n", ans); #endif /* REGEX_DEBUG */ return ans; } #ifdef REGEX_DEBUG char * -lc2c(longchar * x) +lc2c(longchar * x, int len) { static char y[100]; int i = 0; + char *r; - while (x[i]) { + while (x[i] && i < len) { if (x[i] == RE_WHICH_RANGE) - y[i] = '-'; + y[i++] = '-'; + else if (x[i] >= 128) { + y[i++] = ((x[i] >> 8) & 0xff); + y[i++] = (x[i] & 0xff); + } else - y[i] = x[i]; - i++; + y[i++] = x[i]; } y[i] = '\0'; - return y; + r = GC_malloc_atomic(i + 1); + strcpy(r, y); + return r; } void -debugre(re, s) - regexchar *re; - char *s; +debugre(regexchar * re) { - for (; !(re->mode & RE_ENDMARK); re++) { - if (re->mode & RE_BEGIN) { + for (; RE_MODE(re) != RE_ENDMARK; re++) { + switch (RE_MODE(re)) { + case RE_BEGIN: printf("Begin "); continue; - } - else if (re->mode & RE_END) { + case RE_END: printf("End "); continue; } if (re->mode & RE_ANYTIME) printf("Anytime-"); + if (re->mode & RE_OPT) + printf("Opt-"); - switch (re->mode & RE_MATCHMODE) { + switch (RE_MODE(re)) { case RE_ANY: printf("Any "); break; case RE_NORMAL: - printf("Match-to'%c' ", *re->pattern); + printf("Match-to'%c' ", *re->p.pattern); break; case RE_WHICH: - printf("One-of\"%s\" ", lc2c(re->pattern)); + printf("One-of\"%s\" ", lc2c(re->p.pattern, 10000)); break; case RE_EXCEPT: - printf("Other-than\"%s\" ", lc2c(re->pattern)); + printf("Other-than\"%s\" ", lc2c(re->p.pattern, 10000)); break; + case RE_SUBREGEX: + { + Regex *r = re->p.sub; + printf("("); + while (r) { + debugre(r->re); + if (r->alt_regex) + printf(" | "); + r = r->alt_regex; + } + printf(")"); + break; + } default: printf("Unknown "); } } - putchar('\n'); } #endif /* REGEX_DEBUG */ + +#ifdef REGEXTEST +int +main(int argc, char **argv) +{ + char buf[128], buf2[128]; + char *msg; + Regex *re; + char *fpos, *epos; + FILE *f = stdin; + int i = 1; + +#ifdef REGEX_DEBUG + for (i = 1; i < argc; i++) { + if (strcmp(argv[i], "-v") == 0) + verbose = 1; + else + break; + } +#endif + + if (argc > i) + f = fopen(argv[i], "r"); + if (f == NULL) { + fprintf(stderr, "Can't open %s\n", argv[i]); + exit(1); + } + while (fscanf(f, "%s%s", buf, buf2) == 2) { + re = newRegex(buf, 0, NULL, &msg); + if (re == NULL) { + printf("Error on regexp /%s/: %s\n", buf, msg); + exit(1); + } + if (RegexMatch(re, buf2, 0, 1)) { + printf("/%s/\t%s\t", buf, buf2); + MatchedPosition(re, &fpos, &epos); + while (fpos < epos) + putchar(*(fpos++)); + } + else + printf("/%s/\t%s\tno_match", buf, buf2); + putchar('\n'); + } + /* notreatched */ + return 0; +} +#endif |