/* $Id: regex.c,v 1.5 2001/11/24 02:01:26 ukai Exp $ */ /* * regex: Regular expression pattern match library * * by A.ITO, December 1989 */ #ifdef REGEX_DEBUG #include #include #endif /* REGEX_DEBUG */ #include #include #include "fm.h" #include "regex.h" #ifdef JP_CHARSET #define RE_KANJI(p) (((unsigned char)*(p) << 8) | (unsigned char)*((p)+1)) #endif #define RE_WHICH_RANGE 0xffff static Regex DefaultRegex; #define CompiledRegex DefaultRegex.re #define Cstorage DefaultRegex.storage static longchar *st_ptr; static int regmatch(regexchar *, char *, int, int, char **); static int regmatch1(regexchar *, longchar); static int matchWhich(longchar *, longchar); /* * regexCompile: compile regular expression */ char * regexCompile(char *ex, int igncase) { char *msg; newRegex(ex, igncase, &DefaultRegex, &msg); return msg; } Regex * newRegex(char *ex, int igncase, Regex *regex, char **msg) { char *p; longchar *r; regexchar *re = regex->re - 1; int m; if (regex == 0) regex = (Regex *)GC_malloc_atomic(sizeof(Regex)); st_ptr = regex->storage; for (p = ex; *p != '\0'; p++) { switch (*p) { case '.': re++; re->pattern = NULL; re->mode = RE_ANY; break; case '$': re++; re->pattern = NULL; re->mode = RE_END; break; case '^': re++; re->pattern = NULL; re->mode = RE_BEGIN; break; case '*': if (!(re->mode & RE_ANY) && re->pattern == NULL) { if (msg) *msg = "Invalid regular expression"; return NULL; } re->mode |= RE_ANYTIME; break; case '[': r = st_ptr; if (*++p == '^') { p++; m = RE_EXCEPT; } else m = RE_WHICH; while (*p != ']') { if (*p == '\\') { *(st_ptr++) = *(p + 1); p += 2; } else if (*p == '-') { *(st_ptr++) = RE_WHICH_RANGE; p++; } else if (*p == '\0') { if (msg) *msg = "Missing ]"; return NULL; } #ifdef JP_CHARSET else if (IS_KANJI1(*p)) { *(st_ptr++) = RE_KANJI(p); p += 2; } #endif else *(st_ptr++) = (unsigned char)*(p++); } *(st_ptr++) = '\0'; re++; re->pattern = r; re->mode = m; break; case '\\': p++; default: re++; #ifdef JP_CHARSET if (IS_KANJI1(*p)) { *(st_ptr) = RE_KANJI(p); p++; } else #endif *st_ptr = (unsigned char)*p; re->pattern = st_ptr; st_ptr++; re->mode = RE_NORMAL; if (igncase) re->mode |= RE_IGNCASE; } if (st_ptr >= &Cstorage[STORAGE_MAX] || re >= &CompiledRegex[REGEX_MAX]) { if (msg) *msg = "Regular expression too long"; return NULL; } } re++; re->mode = RE_ENDMARK; if (msg) *msg = NULL; return regex; } /* * regexMatch: match regular expression */ int regexMatch(char *str, int len, int firstp) { return RegexMatch(&DefaultRegex, str, len, firstp); } int RegexMatch(Regex *re, char *str, int len, int firstp) { char *p, *ep; if (str == NULL) return 0; re->position = NULL; ep = str + ((len == 0) ? strlen(str) : len); for (p = str; p < ep; p++) { switch (regmatch (re->re, p, ep - p, firstp && (p == str), &re->lposition)) { case 1: re->position = p; return 1; case -1: re->position = NULL; return -1; } #ifdef JP_CHARSET if (IS_KANJI1(*p)) p++; #endif } return 0; } /* * matchedPosition: last matched position */ void MatchedPosition(Regex *re, char **first, char **last) { *first = re->position; *last = re->lposition; } void matchedPosition(char **first, char **last) { *first = DefaultRegex.position; *last = DefaultRegex.lposition; } /* * Intermal routines */ static int regmatch(regexchar * re, char *str, int len, int firstp, char **lastpos) { char *p = str, *ep = str + len; char *lpos, *llpos = NULL; longchar k; *lastpos = NULL; #ifdef REGEX_DEBUG debugre(re, str); #endif /* REGEX_DEBUG */ while ((re->mode & RE_ENDMARK) == 0) { if (re->mode & RE_BEGIN) { if (!firstp) return 0; re++; } else if (re->mode & RE_ANYTIME) { short matched, ok = 0; for (;;) { matched = 0; if (regmatch(re + 1, p, ep - p, firstp, &lpos) == 1) { llpos = lpos; matched = 1; ok = 1; } if (p >= ep) break; #ifdef JP_CHARSET if (IS_KANJI1(*p)) { k = RE_KANJI(p); if (regmatch1(re, k)) { if (lastpos != NULL) *lastpos = llpos; p += 2; } else break; } else #endif { k = (unsigned char)*p; if (regmatch1(re, k)) { p++; if (lastpos != NULL) *lastpos = llpos; } else break; } } if (lastpos != NULL) *lastpos = llpos; return ok; } else if (re->mode & RE_END) { if (lastpos != NULL) *lastpos = p; return (p >= ep); } else { int a; #ifdef JP_CHARSET if (IS_KANJI1(*p)) { k = RE_KANJI(p); p += 2; a = regmatch1(re, k); } else #endif { k = (unsigned char)*(p++); a = regmatch1(re, k); } if (!a) return 0; else re++; } } if (lastpos != NULL) *lastpos = p; return 1; } static int regmatch1(regexchar * re, longchar c) { switch (re->mode & RE_MATCHMODE) { case RE_ANY: #ifdef REGEX_DEBUG printf("%c vs any. -> 1\n", c); #endif /* REGEX_DEBUG */ return 1; case RE_NORMAL: #ifdef REGEX_DEBUG printf("RE=%c vs %c -> %d\n", *re->pattern, c, *re->pattern == c); #endif /* REGEX_DEBUG */ if (re->mode & RE_IGNCASE) { if (*re->pattern < 127 && c < 127 && IS_ALPHA(*re->pattern) && IS_ALPHA(c)) return tolower(*re->pattern) == tolower(c); else return *re->pattern == c; } else return (*re->pattern == c); case RE_WHICH: return matchWhich(re->pattern, c); case RE_EXCEPT: return !matchWhich(re->pattern, c); } return 0; } static int matchWhich(longchar * pattern, longchar c) { longchar *p = pattern; int ans = 0; #ifdef REGEX_DEBUG printf("RE pattern = %s char=%c", pattern, c); #endif /* REGEX_DEBUG */ while (*p != '\0') { if (*(p + 1) == RE_WHICH_RANGE && *(p + 2) != '\0') { /* Char * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * class. * * * * * * * * * * * * * * * */ if (*p <= c && c <= *(p + 2)) { ans = 1; break; } p += 3; } else { if (*p == c) { ans = 1; break; } p++; } } #ifdef REGEX_DEBUG printf(" -> %d\n", ans); #endif /* REGEX_DEBUG */ return ans; } #ifdef REGEX_DEBUG char * lc2c(longchar * x) { static char y[100]; int i = 0; while (x[i]) { if (x[i] == RE_WHICH_RANGE) y[i] = '-'; else y[i] = x[i]; i++; } y[i] = '\0'; return y; } void debugre(re, s) regexchar *re; char *s; { for (; !(re->mode & RE_ENDMARK); re++) { if (re->mode & RE_BEGIN) { printf("Begin "); continue; } else if (re->mode & RE_END) { printf("End "); continue; } if (re->mode & RE_ANYTIME) printf("Anytime-"); switch (re->mode & RE_MATCHMODE) { case RE_ANY: printf("Any "); break; case RE_NORMAL: printf("Match-to'%c' ", *re->pattern); break; case RE_WHICH: printf("One-of\"%s\" ", lc2c(re->pattern)); break; case RE_EXCEPT: printf("Other-than\"%s\" ", lc2c(re->pattern)); break; default: printf("Unknown "); } } putchar('\n'); } #endif /* REGEX_DEBUG */