diff options
author | Akinori Ito <aito@eie.yz.yamagata-u.ac.jp> | 2001-11-08 05:14:08 +0000 |
---|---|---|
committer | Akinori Ito <aito@eie.yz.yamagata-u.ac.jp> | 2001-11-08 05:14:08 +0000 |
commit | 68a07bf03b7624c9924065cce9ffa45497225834 (patch) | |
tree | c2adb06a909a8594445e4a3f8587c4bad46e3ecd /regex.c | |
download | w3m-68a07bf03b7624c9924065cce9ffa45497225834.tar.gz w3m-68a07bf03b7624c9924065cce9ffa45497225834.zip |
Initial revision
Diffstat (limited to 'regex.c')
-rw-r--r-- | regex.c | 433 |
1 files changed, 433 insertions, 0 deletions
@@ -0,0 +1,433 @@ +/* + * regex: Regular expression pattern match library + * + * by A.ITO, December 1989 + */ + +#ifdef REGEX_DEBUG +#include <sys/types.h> +#include <malloc.h> +#endif /* REGEX_DEBUG */ +#include <ctype.h> +#include <gc.h> +#ifdef __EMX__ +#include <strings.h> +#endif +#include "fm.h" +#include "regex.h" + +#ifdef JP_CHARSET +#define RE_KANJI(p) (((unsigned char)*(p) << 8) | (unsigned char)*((p)+1)) +#endif + +#define RE_WHICH_RANGE 0xffff + +static Regex DefaultRegex; +#define CompiledRegex DefaultRegex.re +#define Cstorage DefaultRegex.storage + +static longchar *st_ptr; + +static int regmatch(regexchar *, char *, int, int, char **); +static int regmatch1(regexchar *, longchar); +static int matchWhich(longchar *, longchar); + + +/* + * regexCompile: compile regular expression + */ +char * +regexCompile(char *ex, int igncase) +{ + char *msg; + newRegex(ex, igncase, &DefaultRegex, &msg); + return msg; +} + +Regex * +newRegex(char *ex, int igncase, Regex * regex, char **msg) +{ + char *p; + longchar *r; + regexchar *re = regex->re - 1; + int m; + + if (regex == 0) + regex = (Regex *) GC_malloc_atomic(sizeof(Regex)); + st_ptr = regex->storage; + for (p = ex; *p != '\0'; p++) { + switch (*p) { + case '.': + re++; + re->pattern = NULL; + re->mode = RE_ANY; + break; + case '$': + re++; + re->pattern = NULL; + re->mode = RE_END; + break; + case '^': + re++; + re->pattern = NULL; + re->mode = RE_BEGIN; + break; + case '*': + if (!(re->mode & RE_ANY) && re->pattern == NULL) { + if (msg) + *msg = "Invalid regular expression"; + return NULL; + } + re->mode |= RE_ANYTIME; + break; + case '[': + r = st_ptr; + if (*++p == '^') { + p++; + m = RE_EXCEPT; + } + else + m = RE_WHICH; + while (*p != ']') { + if (*p == '\\') { + *(st_ptr++) = *(p + 1); + p += 2; + } + else if (*p == '-') { + *(st_ptr++) = RE_WHICH_RANGE; + p++; + } + else if (*p == '\0') { + if (msg) + *msg = "Missing ]"; + return NULL; + } +#ifdef JP_CHARSET + else if (IS_KANJI1(*p)) { + *(st_ptr++) = RE_KANJI(p); + p += 2; + } +#endif + else + *(st_ptr++) = (unsigned char)*(p++); + } + *(st_ptr++) = '\0'; + re++; + re->pattern = r; + re->mode = m; + break; + case '\\': + p++; + default: + re++; +#ifdef JP_CHARSET + if (IS_KANJI1(*p)) { + *(st_ptr) = RE_KANJI(p); + p++; + } + else +#endif + *st_ptr = (unsigned char)*p; + re->pattern = st_ptr; + st_ptr++; + re->mode = RE_NORMAL; + if (igncase) + re->mode |= RE_IGNCASE; + } + if (st_ptr >= &Cstorage[STORAGE_MAX] || + re >= &CompiledRegex[REGEX_MAX]) { + if (msg) + *msg = "Regular expression too long"; + return NULL; + } + } + re++; + re->mode = RE_ENDMARK; + if (msg) + *msg = NULL; + return regex; +} + +/* + * regexMatch: match regular expression + */ +int +regexMatch(char *str, int len, int firstp) +{ + return RegexMatch(&DefaultRegex, str, len, firstp); +} + +int +RegexMatch(Regex * re, char *str, int len, int firstp) +{ + char *p, *ep; + + if (str == NULL) + return 0; + re->position = NULL; + ep = str + ((len == 0) ? strlen(str) : len); + for (p = str; p < ep; p++) { + switch (regmatch(re->re, p, ep - p, firstp && (p == str), &re->lposition)) { + case 1: + re->position = p; + return 1; + case -1: + re->position = NULL; + return -1; + } +#ifdef JP_CHARSET + if (IS_KANJI1(*p)) + p++; +#endif + } + return 0; +} + +/* + * matchedPosition: last matched position + */ +void +MatchedPosition(Regex * re, char **first, char **last) +{ + *first = re->position; + *last = re->lposition; +} + +void +matchedPosition(char **first, char **last) +{ + *first = DefaultRegex.position; + *last = DefaultRegex.lposition; +} + +/* + * Intermal routines + */ +static int +regmatch(regexchar * re, char *str, int len, int firstp, char **lastpos) +{ + char *p = str, *ep = str + len; + char *lpos, *llpos = NULL; + longchar k; + +#ifdef REGEX_DEBUG + debugre(re, str); +#endif /* REGEX_DEBUG */ + while ((re->mode & RE_ENDMARK) == 0) { + if (re->mode & RE_BEGIN) { + if (!firstp) + return 0; + re++; + } + else if (re->mode & RE_ANYTIME) { + short matched = 0, ok = 0; + do { + if (regmatch(re + 1, p, ep - p, firstp, &lpos) == 1) { + llpos = lpos; + matched = 1; + } + else if (matched) { + ok = 1; + break; + } + if (p >= ep) { + if (matched) + ok = 1; + break; + } +#ifdef JP_CHARSET + if (IS_KANJI1(*p)) { + k = RE_KANJI(p); + if (regmatch1(re, k)) { + if (lastpos != NULL) + *lastpos = llpos; + p += 2; + } + else if (matched) + ok = 1; + else + break; + } + else +#endif + { + k = (unsigned char)*p; + if (regmatch1(re, k)) { + p++; + if (lastpos != NULL) + *lastpos = llpos; + } + else if (matched) + ok = 1; + else + break; + } + } while (!ok); + if (lastpos != NULL) + *lastpos = llpos; + return ok; + } + else if (re->mode & RE_END) { + if (lastpos != NULL) + *lastpos = p; + return (p >= ep); + } + else { + int a; +#ifdef JP_CHARSET + if (IS_KANJI1(*p)) { + k = RE_KANJI(p); + p += 2; + a = regmatch1(re, k); + } + else +#endif + { + k = (unsigned char)*(p++); + a = regmatch1(re, k); + } + if (!a) + return 0; + else + re++; + } + } + if (lastpos != NULL) + *lastpos = p; + return 1; +} + +static int +regmatch1(regexchar * re, longchar c) +{ + switch (re->mode & RE_MATCHMODE) { + case RE_ANY: +#ifdef REGEX_DEBUG + printf("%c vs any. -> 1\n", c); +#endif /* REGEX_DEBUG */ + return 1; + case RE_NORMAL: +#ifdef REGEX_DEBUG + printf("RE=%c vs %c -> %d\n", *re->pattern, c, *re->pattern == c); +#endif /* REGEX_DEBUG */ + if (re->mode & RE_IGNCASE) { + if (*re->pattern < 127 && c < 127 && + IS_ALPHA(*re->pattern) && IS_ALPHA(c)) + return tolower(*re->pattern) == tolower(c); + else + return *re->pattern == c; + } + else + return (*re->pattern == c); + case RE_WHICH: + return matchWhich(re->pattern, c); + case RE_EXCEPT: + return !matchWhich(re->pattern, c); + } + return 0; +} + +static int +matchWhich(longchar * pattern, longchar c) +{ + longchar *p = pattern; + int ans = 0; + +#ifdef REGEX_DEBUG + printf("RE pattern = %s char=%c", pattern, c); +#endif /* REGEX_DEBUG */ + while (*p != '\0') { + if (*(p + 1) == RE_WHICH_RANGE && *(p + 2) != '\0') { /* Char * + * + * * * * + * * * * * + * * * * + * * * * + * * * * + * * * * + * * * * + * * * * + * * * * * + * * * * * + * * * + * class. + * * * * * + * * * * * + * * * * * + */ + if (*p <= c && c <= *(p + 2)) { + ans = 1; + break; + } + p += 3; + } + else { + if (*p == c) { + ans = 1; + break; + } + p++; + } + } +#ifdef REGEX_DEBUG + printf(" -> %d\n", ans); +#endif /* REGEX_DEBUG */ + return ans; +} + +#ifdef REGEX_DEBUG +char * +lc2c(longchar * x) +{ + static char y[100]; + int i = 0; + + while (x[i]) { + if (x[i] == RE_WHICH_RANGE) + y[i] = '-'; + else + y[i] = x[i]; + i++; + } + y[i] = '\0'; + return y; +} + +void +debugre(re, s) + regexchar *re; + char *s; +{ + for (; !(re->mode & RE_ENDMARK); re++) { + if (re->mode & RE_BEGIN) { + printf("Begin "); + continue; + } + else if (re->mode & RE_END) { + printf("End "); + continue; + } + if (re->mode & RE_ANYTIME) + printf("Anytime-"); + + switch (re->mode & RE_MATCHMODE) { + case RE_ANY: + printf("Any "); + break; + case RE_NORMAL: + printf("Match-to'%c' ", *re->pattern); + break; + case RE_WHICH: + printf("One-of\"%s\" ", lc2c(re->pattern)); + break; + case RE_EXCEPT: + printf("Other-than\"%s\" ", lc2c(re->pattern)); + break; + default: + printf("Unknown "); + } + } + putchar('\n'); +} + +#endif /* REGEX_DEBUG */ |