diff options
author | Tatsuya Kinoshita <tats@vega.ocn.ne.jp> | 2011-05-04 07:05:14 +0000 |
---|---|---|
committer | Tatsuya Kinoshita <tats@vega.ocn.ne.jp> | 2011-05-04 07:05:14 +0000 |
commit | 72f72d64a422d6628c4796f5c0bf2e508f134214 (patch) | |
tree | 0c9ea90cc53310832c977265521fb44db24a515e /regex.c | |
parent | Adding upstream version 0.3 (diff) | |
download | w3m-72f72d64a422d6628c4796f5c0bf2e508f134214.tar.gz w3m-72f72d64a422d6628c4796f5c0bf2e508f134214.zip |
Adding upstream version 0.5.1upstream/0.5.1
Diffstat (limited to 'regex.c')
-rw-r--r-- | regex.c | 842 |
1 files changed, 842 insertions, 0 deletions
@@ -0,0 +1,842 @@ +/* $Id: regex.c,v 1.22 2003/09/24 18:49:00 ukai Exp $ */ +/* + * regex: Regular expression pattern match library + * + * by A.ITO, December 1989 + * Revised by A.ITO, January 2002 + */ + +#ifdef REGEX_DEBUG +#include <sys/types.h> +#include <malloc.h> +#endif /* REGEX_DEBUG */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <gc.h> +#include "config.h" +#ifdef USE_M17N +#include "wc.h" +#include "wtf.h" +#ifdef USE_UNICODE +#include "ucs.h" +#endif +#endif +#include "regex.h" +#include "config.h" +#include "myctype.h" + +#ifndef NULL +#define NULL 0 +#endif /* not NULL */ + +#define RE_ITER_LIMIT 65535 + +#define RE_MATCHMODE 0x07 +#define RE_NORMAL 0x00 +#define RE_ANY 0x01 +#define RE_WHICH 0x02 +#define RE_EXCEPT 0x03 +#define RE_SUBREGEX 0x04 +#define RE_BEGIN 0x05 +#define RE_END 0x06 +#define RE_ENDMARK 0x07 + +#define RE_OPT 0x08 +#define RE_ANYTIME 0x10 +#define RE_IGNCASE 0x40 + +#define RE_MODE(x) ((x)->mode&RE_MATCHMODE) +#define RE_SET_MODE(x,v) ((x)->mode = (((x)->mode&~RE_MATCHMODE)|((v)&RE_MATCHMODE))) + +#ifdef REGEX_DEBUG +void debugre(regexchar *); +char *lc2c(longchar *, int); +int verbose; +#endif /* REGEX_DEBUG */ + +#ifdef USE_M17N +#define get_mclen(c) wtf_len1((wc_uchar *)(c)) +#else +#define get_mclen(c) 1 +#endif + +#ifndef TOLOWER +#include <ctype.h> +#define TOLOWER(x) tolower(x) +#define TOUPPER(x) toupper(x) +#endif + +#define RE_TYPE_END 0 +#define RE_TYPE_CHAR 1 +#define RE_TYPE_WCHAR_T 2 +#define RE_WHICH_RANGE 3 +#define RE_TYPE_SYMBOL 4 + +static longchar +set_longchar(char *str) +{ + unsigned char *p = (unsigned char *)str; + longchar r; + +#ifdef USE_M17N + if (*p & 0x80) { + r.wch = wtf_parse1(&p); + if (r.wch.ccs == WC_CCS_SPECIAL || r.wch.ccs == WC_CCS_SPECIAL_W) { + r.type = RE_TYPE_SYMBOL; + return r; + } +#ifdef USE_UNICODE + if (WC_CCS_IS_UNICODE(r.wch.ccs)) { + if (WC_CCS_SET(r.wch.ccs) == WC_CCS_UCS_TAG) + r.wch.code = wc_ucs_tag_to_ucs(r.wch.code); + r.wch.ccs = WC_CCS_UCS4; + } + else +#endif + r.wch.ccs = WC_CCS_SET(r.wch.ccs); + r.type = RE_TYPE_WCHAR_T; + return r; + } +#endif + r.ch = *p; + r.type = RE_TYPE_CHAR; + return r; +} + +static Regex DefaultRegex; +#define CompiledRegex DefaultRegex.re +#define Cstorage DefaultRegex.storage + +static int regmatch(regexchar *, char *, char *, int, char **); +static int regmatch1(regexchar *, longchar *); +static int matchWhich(longchar *, longchar *, int); +static int match_longchar(longchar *, longchar *, int); +static int match_range_longchar(longchar *, longchar *, longchar *, int); + +/* + * regexCompile: compile regular expression + */ +char * +regexCompile(char *ex, int igncase) +{ + char *msg; + newRegex(ex, igncase, &DefaultRegex, &msg); + return msg; +} + +static Regex * +newRegex0(char **ex, int igncase, Regex *regex, char **msg, int level) +{ + char *p; + longchar *r; + regexchar *re; + int m; + longchar *st_ptr; + + if (regex == NULL) + regex = (Regex *)GC_malloc(sizeof(Regex)); + regex->alt_regex = NULL; + re = regex->re; + st_ptr = regex->storage; + for (p = *ex; *p != '\0'; p++) { + re->mode = 0; + switch (*p) { + case '.': + re->p.pattern = NULL; + RE_SET_MODE(re, RE_ANY); + re++; + break; + case '$': + re->p.pattern = NULL; + RE_SET_MODE(re, RE_END); + re++; + break; + case '^': + re->p.pattern = NULL; + RE_SET_MODE(re, RE_BEGIN); + re++; + break; + case '+': + if (re == regex->re || + (RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) { + if (msg) + *msg = "Invalid regular expression"; + return NULL; + } + *re = *(re - 1); + re->mode |= RE_ANYTIME; + re++; + break; + case '*': + if (re == regex->re || + (RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) { + if (msg) + *msg = "Invalid regular expression"; + return NULL; + } + (re - 1)->mode |= RE_ANYTIME; + break; + case '?': + if (re == regex->re || + (RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) { + if (msg) + *msg = "Invalid regular expression"; + return NULL; + } + (re - 1)->mode |= RE_OPT; + break; + case '[': + r = st_ptr; + if (*++p == '^') { + p++; + m = RE_EXCEPT; + } + else + m = RE_WHICH; + if (*p == '-' || *p == ']') + *(st_ptr++) = set_longchar(p); + while (*p != ']') { + if (*p == '\\') { + p++; + *(st_ptr++) = set_longchar(p); + p += get_mclen(p); + } + else if (*p == '-' && *(p + 1) != ']') { + (st_ptr++)->type = RE_WHICH_RANGE; + p++; + } + else if (*p == '\0') { + if (msg) + *msg = "Missing ]"; + return NULL; + } + else { + *(st_ptr++) = set_longchar(p); + p += get_mclen(p); + } + if (st_ptr >= ®ex->storage[STORAGE_MAX]) { + if (msg) + *msg = "Regular expression too long"; + return NULL; + } + } + (st_ptr++)->type = RE_TYPE_END; + re->p.pattern = r; + RE_SET_MODE(re, m); + if (igncase) + re->mode |= RE_IGNCASE; + re++; + break; + case '|': + RE_SET_MODE(re, RE_ENDMARK); + re++; + p++; + regex->alt_regex = newRegex0(&p, igncase, NULL, msg, level); + if (regex->alt_regex == NULL) + return NULL; + *ex = p; + return regex; + case '(': + RE_SET_MODE(re, RE_SUBREGEX); + p++; + re->p.sub = newRegex0(&p, igncase, NULL, msg, level + 1); + if (re->p.sub == NULL) + return NULL; + re++; + break; + case ')': + if (level == 0) { + if (msg) + *msg = "Too many ')'"; + return NULL; + } + RE_SET_MODE(re, RE_ENDMARK); + re++; + *ex = p; + return regex; + case '\\': + p++; + default: + *(st_ptr) = set_longchar(p); + p += get_mclen(p) - 1; + re->p.pattern = st_ptr; + st_ptr++; + RE_SET_MODE(re, RE_NORMAL); + if (igncase) + re->mode |= RE_IGNCASE; + re++; + } + if (st_ptr >= ®ex->storage[STORAGE_MAX] || + re >= ®ex->re[REGEX_MAX]) { + if (msg) + *msg = "Regular expression too long"; + return NULL; + } + } + RE_SET_MODE(re, RE_ENDMARK); + if (msg) + *msg = NULL; + *ex = p; + return regex; +} + +Regex * +newRegex(char *ex, int igncase, Regex *regex, char **msg) +{ + return newRegex0(&ex, igncase, regex, msg, 0); +} + +/* + * regexMatch: match regular expression + */ +int +regexMatch(char *str, int len, int firstp) +{ + return RegexMatch(&DefaultRegex, str, len, firstp); +} + +int +RegexMatch(Regex *re, char *str, int len, int firstp) +{ + char *p, *ep; + char *lpos; + Regex *r; + + if (str == NULL) + return 0; + if (len < 0) + len = strlen(str); + re->position = NULL; + ep = str + len; + for (p = str; p <= ep; p++) { + lpos = NULL; + re->lposition = NULL; + for (r = re; r != NULL; r = r->alt_regex) { + switch (regmatch(r->re, p, ep, firstp && (p == str), &lpos)) { + case 1: /* matched */ + re->position = p; + if (re->lposition == NULL || re->lposition < lpos) + re->lposition = lpos; + break; + case -1: /* error */ + re->position = NULL; + return -1; + } + } + if (re->lposition != NULL) { + /* matched */ + return 1; + } + p += get_mclen(p) - 1; + } + return 0; +} + +/* + * matchedPosition: last matched position + */ +void +MatchedPosition(Regex *re, char **first, char **last) +{ + *first = re->position; + *last = re->lposition; +} + +void +matchedPosition(char **first, char **last) +{ + *first = DefaultRegex.position; + *last = DefaultRegex.lposition; +} + +/* + * Intermal routines + */ + +struct MatchingContext1 { + int label; + regexchar *re; + char *lastpos; + char *str; + int iter_limit; + int n_any; + int firstp; + char *end_p; + Regex *sub_regex; + struct MatchingContext1 *sub_ctx; + struct MatchingContext2 *ctx2; +}; + +struct MatchingContext2 { + int label; + Regex *regex; + char *lastpos; + struct MatchingContext1 *ctx; + struct MatchingContext2 *ctx2; + char *str; + int n_any; + int firstp; +}; + + +#define YIELD(retval,context,lnum) (context)->label = lnum; return (retval); label##lnum: + +static int regmatch_iter(struct MatchingContext1 *, + regexchar *, char *, char *, int); + +static int +regmatch_sub_anytime(struct MatchingContext2 *c, Regex *regex, + regexchar * pat2, + char *str, char *end_p, int iter_limit, int firstp) +{ + switch (c->label) { + case 1: + goto label1; + case 2: + goto label2; + case 3: + goto label3; + } + c->ctx = GC_malloc(sizeof(struct MatchingContext1)); + c->ctx2 = GC_malloc(sizeof(struct MatchingContext2)); + c->ctx->label = 0; + c->regex = regex; + c->n_any = 0; + c->str = str; + c->firstp = firstp; + for (;;) { + c->ctx->label = 0; + while (regmatch_iter(c->ctx, c->regex->re, c->str, end_p, c->firstp)) { + c->n_any = c->ctx->lastpos - c->str; + if (c->n_any <= 0) + continue; + c->firstp = 0; + if (RE_MODE(pat2) == RE_ENDMARK) { + c->lastpos = c->str + c->n_any; + YIELD(1, c, 1); + } + else if (regmatch(pat2, c->str + c->n_any, end_p, + c->firstp, &c->lastpos) == 1) { + YIELD(1, c, 2); + } + if (iter_limit == 1) + continue; + c->ctx2->label = 0; + while (regmatch_sub_anytime(c->ctx2, regex, pat2, + c->str + c->n_any, end_p, + iter_limit - 1, c->firstp)) { + + c->lastpos = c->ctx2->lastpos; + YIELD(1, c, 3); + } + } + if (c->regex->alt_regex == NULL) + break; + c->regex = c->regex->alt_regex; + } + return 0; +} + +static int +regmatch_iter(struct MatchingContext1 *c, + regexchar * re, char *str, char *end_p, int firstp) +{ + switch (c->label) { + case 1: + goto label1; + case 2: + goto label2; + case 3: + goto label3; + case 4: + goto label4; + case 5: + goto label5; + case 6: + goto label6; + case 7: + goto label7; + } + if (RE_MODE(re) == RE_ENDMARK) + return 0; + c->re = re; + c->firstp = firstp; + c->str = str; + c->end_p = end_p; + c->sub_ctx = NULL; + c->lastpos = NULL; + while (RE_MODE(c->re) != RE_ENDMARK) { + if (c->re->mode & (RE_ANYTIME | RE_OPT)) { + if (c->re->mode & RE_ANYTIME) + c->iter_limit = RE_ITER_LIMIT; + else + c->iter_limit = 1; + c->n_any = -1; + while (c->n_any < c->iter_limit) { + if (c->str + c->n_any >= c->end_p) { + return 0; + } + if (c->n_any >= 0) { + if (RE_MODE(c->re) == RE_SUBREGEX) { + c->ctx2 = GC_malloc(sizeof(struct MatchingContext2)); + c->ctx2->label = 0; + while (regmatch_sub_anytime(c->ctx2, + c->re->p.sub, + c->re + 1, + c->str + c->n_any, + c->end_p, + c->iter_limit, + c->firstp)) { + c->n_any = c->ctx2->lastpos - c->str; + c->lastpos = c->ctx2->lastpos; + YIELD(1, c, 1); + } + return 0; + } + else { + longchar k; + k = set_longchar(c->str + c->n_any); + if (regmatch1(c->re, &k)) { + c->n_any += get_mclen(c->str + c->n_any); + } + else { + return 0; + } + c->firstp = 0; + } + } + else + c->n_any++; + if (RE_MODE(c->re + 1) == RE_ENDMARK) { + c->lastpos = c->str + c->n_any; + YIELD(1, c, 2); + } + else if (regmatch(c->re + 1, c->str + c->n_any, c->end_p, + c->firstp, &c->lastpos) == 1) { + YIELD(1, c, 3); + } + } + return 0; + } + /* regexp other than pat*, pat+ and pat? */ + switch (RE_MODE(c->re)) { + case RE_BEGIN: + if (!c->firstp) + return 0; + c->re++; + break; + case RE_END: + if (c->str >= c->end_p) { + c->lastpos = c->str; + c->re++; + YIELD(1, c, 4); + } + else { + c->lastpos = NULL; + return 0; + } + break; + case RE_SUBREGEX: + if (c->sub_ctx == NULL) { + c->sub_ctx = GC_malloc(sizeof(struct MatchingContext1)); + } + c->sub_regex = c->re->p.sub; + for (;;) { + c->sub_ctx->label = 0; + while (regmatch_iter(c->sub_ctx, c->sub_regex->re, + c->str, c->end_p, c->firstp)) { + if (c->sub_ctx->lastpos != c->str) + c->firstp = 0; + if (RE_MODE(c->re + 1) == RE_ENDMARK) { + c->lastpos = c->sub_ctx->lastpos; + YIELD(1, c, 5); + } + else if (regmatch(c->re + 1, c->sub_ctx->lastpos, c->end_p, + c->firstp, &c->lastpos) == 1) { + YIELD(1, c, 6); + } + } + if (c->sub_regex->alt_regex == NULL) + break; + c->sub_regex = c->sub_regex->alt_regex; + } + return 0; + default: + { + longchar k; + k = set_longchar(c->str); + c->str += get_mclen(c->str); + if (!regmatch1(c->re, &k)) + return 0; + } + c->re++; + c->firstp = 0; + } + if (c->str > c->end_p) { + return 0; + } + } + c->lastpos = c->str; +#ifdef REGEX_DEBUG + if (verbose) + printf("Succeed: %s %d\n", c->str, c->lastpos - c->str); +#endif + YIELD(1, c, 7); + return 0; +} + +static int +regmatch(regexchar * re, char *str, char *end_p, int firstp, char **lastpos) +{ + struct MatchingContext1 contx; + + *lastpos = NULL; + + contx.label = 0; + while (regmatch_iter(&contx, re, str, end_p, firstp)) { +#ifdef REGEX_DEBUG + char *p; + if (verbose) { + printf("regmatch: matched <"); + for (p = str; p < contx.lastpos; p++) + putchar(*p); + printf(">\n"); + } +#endif + if (*lastpos == NULL || *lastpos < contx.lastpos) + *lastpos = contx.lastpos; + } + if (*lastpos == NULL) + return 0; + return 1; +} + + +static int +regmatch1(regexchar * re, longchar * c) +{ + int ans; + +#ifdef USE_M17N + if (c->type == RE_TYPE_SYMBOL) + return 0; +#endif + switch (RE_MODE(re)) { + case RE_ANY: +#ifdef REGEX_DEBUG + if (verbose) + printf("%s vs any. -> 1\n", lc2c(c, 1)); +#endif /* REGEX_DEBUG */ + return 1; + case RE_NORMAL: + ans = match_longchar(re->p.pattern, c, re->mode & RE_IGNCASE); +#ifdef REGEX_DEBUG + if (verbose) + printf("RE=%s vs %s -> %d\n", lc2c(re->p.pattern, 1), lc2c(c, 1), + ans); +#endif /* REGEX_DEBUG */ + return ans; + case RE_WHICH: + return matchWhich(re->p.pattern, c, re->mode & RE_IGNCASE); + case RE_EXCEPT: + return !matchWhich(re->p.pattern, c, re->mode & RE_IGNCASE); + } + return 0; +} + +static int +matchWhich(longchar * pattern, longchar * c, int igncase) +{ + longchar *p = pattern; + int ans = 0; + +#ifdef REGEX_DEBUG + if (verbose) + printf("RE pattern = %s char=%s", lc2c(pattern, 10000), lc2c(c, 1)); +#endif /* REGEX_DEBUG */ + while (p->type != RE_TYPE_END) { + if ((p + 1)->type == RE_WHICH_RANGE && (p + 2)->type != RE_TYPE_END) { + if (match_range_longchar(p, p + 2, c, igncase)) { + ans = 1; + break; + } + p += 3; + } + else { + if (match_longchar(p, c, igncase)) { + ans = 1; + break; + } + p++; + } + } +#ifdef REGEX_DEBUG + if (verbose) + printf(" -> %d\n", ans); +#endif /* REGEX_DEBUG */ + return ans; +} + +static int +match_longchar(longchar * a, longchar * b, int ignore) +{ +#ifdef USE_M17N + if (a->type != b->type) + return 0; + if (a->type == RE_TYPE_WCHAR_T) + return (a->wch.ccs == b->wch.ccs) && (a->wch.code == b->wch.code); +#endif + if (ignore && IS_ALPHA(b->ch)) + return (a->ch == TOLOWER(b->ch) || a->ch == TOUPPER(b->ch)); + else + return a->ch == b->ch; +} + +static int +match_range_longchar(longchar * a, longchar * b, longchar * c, int ignore) +{ +#ifdef USE_M17N + if (a->type != b->type || a->type != c->type) + return 0; + if (a->type == RE_TYPE_WCHAR_T) + return ((a->wch.ccs == c->wch.ccs && c->wch.ccs == b->wch.ccs) && + (a->wch.code <= c->wch.code && c->wch.code <= b->wch.code)); +#endif + if (ignore && IS_ALPHA(c->ch)) + return ((a->ch <= TOLOWER(c->ch) && TOLOWER(c->ch) <= b->ch) || + (a->ch <= TOUPPER(c->ch) && TOUPPER(c->ch) <= b->ch)); + else + return (a->ch <= c->ch && c->ch <= b->ch); +} + +#ifdef REGEX_DEBUG +char * +lc2c(longchar * x, int len) +{ + static char y[100]; + int i = 0, j = 0; + char *r; + + while (x[j].type != RE_TYPE_END && j < len) { + if (x[j].type == RE_WHICH_RANGE) + y[i++] = '-'; +#ifdef USE_M17N + else if (x[j].type == RE_TYPE_WCHAR_T) { + char buf[20]; + sprintf(buf, "[%x-%x]", x[j].wch.ccs, x[j].wch.code); + strcpy(&y[i], buf); + i += strlen(buf); + } +#endif + else + y[i++] = x[j].ch; + j++; + } + y[i] = '\0'; + r = GC_malloc_atomic(i + 1); + strcpy(r, y); + return r; +} + +void +debugre(regexchar * re) +{ + for (; RE_MODE(re) != RE_ENDMARK; re++) { + switch (RE_MODE(re)) { + case RE_BEGIN: + printf("Begin "); + continue; + case RE_END: + printf("End "); + continue; + } + if (re->mode & RE_ANYTIME) + printf("Anytime-"); + if (re->mode & RE_OPT) + printf("Opt-"); + + switch (RE_MODE(re)) { + case RE_ANY: + printf("Any "); + break; + case RE_NORMAL: + printf("Match-to'%c' ", *re->p.pattern); + break; + case RE_WHICH: + printf("One-of\"%s\" ", lc2c(re->p.pattern, 10000)); + break; + case RE_EXCEPT: + printf("Other-than\"%s\" ", lc2c(re->p.pattern, 10000)); + break; + case RE_SUBREGEX: + { + Regex *r = re->p.sub; + printf("("); + while (r) { + debugre(r->re); + if (r->alt_regex) + printf(" | "); + r = r->alt_regex; + } + printf(")"); + break; + } + default: + printf("Unknown "); + } + } +} + +#endif /* REGEX_DEBUG */ + +#ifdef REGEXTEST +int +main(int argc, char **argv) +{ + char buf[128], buf2[128]; + char *msg; + Regex *re; + char *fpos, *epos; + FILE *f = stdin; + int i = 1; + +#ifdef USE_M17N + wtf_init(WC_CES_EUC_JP, WC_CES_EUC_JP); +#endif +#ifdef REGEX_DEBUG + for (i = 1; i < argc; i++) { + if (strcmp(argv[i], "-v") == 0) + verbose = 1; + else + break; + } +#endif + + if (argc > i) + f = fopen(argv[i], "r"); + if (f == NULL) { + fprintf(stderr, "Can't open %s\n", argv[i]); + exit(1); + } + while (fscanf(f, "%s%s", buf, buf2) == 2) { + re = newRegex(buf, 0, NULL, &msg); + if (re == NULL) { + printf("Error on regexp /%s/: %s\n", buf, msg); + exit(1); + } + if (RegexMatch(re, buf2, -1, 1)) { + printf("/%s/\t\"%s\"\t\"", buf, buf2); + MatchedPosition(re, &fpos, &epos); + while (fpos < epos) + putchar(*(fpos++)); + putchar('"'); + } + else + printf("/%s/\t\"%s\"\tno_match", buf, buf2); + putchar('\n'); + } + /* notreatched */ + return 0; +} +#endif |