/* $Id: regex.c,v 1.6 2001/11/30 10:10:24 ukai Exp $ */
/*
* regex: Regular expression pattern match library
*
* by A.ITO, December 1989
*/
#ifdef REGEX_DEBUG
#include <sys/types.h>
#include <malloc.h>
#endif /* REGEX_DEBUG */
#include <ctype.h>
#include <gc.h>
#include "fm.h"
#include "regex.h"
#ifdef JP_CHARSET
#define RE_KANJI(p) (((unsigned char)*(p) << 8) | (unsigned char)*((p)+1))
#endif
#define RE_WHICH_RANGE 0xffff
static Regex DefaultRegex;
#define CompiledRegex DefaultRegex.re
#define Cstorage DefaultRegex.storage
static longchar *st_ptr;
static int regmatch(regexchar *, char *, int, int, char **);
static int regmatch1(regexchar *, longchar);
static int matchWhich(longchar *, longchar);
/*
* regexCompile: compile regular expression
*/
char *
regexCompile(char *ex, int igncase)
{
char *msg;
newRegex(ex, igncase, &DefaultRegex, &msg);
return msg;
}
Regex *
newRegex(char *ex, int igncase, Regex *regex, char **msg)
{
char *p;
longchar *r;
regexchar *re = regex->re - 1;
int m;
if (regex == 0)
regex = (Regex *)GC_malloc_atomic(sizeof(Regex));
st_ptr = regex->storage;
for (p = ex; *p != '\0'; p++) {
switch (*p) {
case '.':
re++;
re->pattern = NULL;
re->mode = RE_ANY;
break;
case '$':
re++;
re->pattern = NULL;
re->mode = RE_END;
break;
case '^':
re++;
re->pattern = NULL;
re->mode = RE_BEGIN;
break;
case '*':
if (!(re->mode & RE_ANY) && re->pattern == NULL) {
if (msg)
*msg = "Invalid regular expression";
return NULL;
}
re->mode |= RE_ANYTIME;
break;
case '[':
r = st_ptr;
if (*++p == '^') {
p++;
m = RE_EXCEPT;
}
else
m = RE_WHICH;
while (*p != ']') {
if (*p == '\\') {
*(st_ptr++) = *(p + 1);
p += 2;
}
else if (*p == '-') {
*(st_ptr++) = RE_WHICH_RANGE;
p++;
}
else if (*p == '\0') {
if (msg)
*msg = "Missing ]";
return NULL;
}
#ifdef JP_CHARSET
else if (IS_KANJI1(*p)) {
*(st_ptr++) = RE_KANJI(p);
p += 2;
}
#endif
else
*(st_ptr++) = (unsigned char)*(p++);
}
*(st_ptr++) = '\0';
re++;
re->pattern = r;
re->mode = m;
break;
case '\\':
p++;
default:
re++;
#ifdef JP_CHARSET
if (IS_KANJI1(*p)) {
*(st_ptr) = RE_KANJI(p);
p++;
}
else
#endif
*st_ptr = (unsigned char)*p;
re->pattern = st_ptr;
st_ptr++;
re->mode = RE_NORMAL;
if (igncase)
re->mode |= RE_IGNCASE;
}
if (st_ptr >= &Cstorage[STORAGE_MAX] ||
re >= &CompiledRegex[REGEX_MAX]) {
if (msg)
*msg = "Regular expression too long";
return NULL;
}
}
re++;
re->mode = RE_ENDMARK;
if (msg)
*msg = NULL;
return regex;
}
/*
* regexMatch: match regular expression
*/
int
regexMatch(char *str, int len, int firstp)
{
return RegexMatch(&DefaultRegex, str, len, firstp);
}
int
RegexMatch(Regex *re, char *str, int len, int firstp)
{
char *p, *ep;
if (str == NULL)
return 0;
re->position = NULL;
ep = str + ((len == 0) ? strlen(str) : len);
for (p = str; p < ep; p++) {
switch (regmatch
(re->re, p, ep - p, firstp && (p == str), &re->lposition)) {
case 1:
re->position = p;
return 1;
case -1:
re->position = NULL;
return -1;
}
#ifdef JP_CHARSET
if (IS_KANJI1(*p))
p++;
#endif
}
return 0;
}
/*
* matchedPosition: last matched position
*/
void
MatchedPosition(Regex *re, char **first, char **last)
{
*first = re->position;
*last = re->lposition;
}
void
matchedPosition(char **first, char **last)
{
*first = DefaultRegex.position;
*last = DefaultRegex.lposition;
}
/*
* Intermal routines
*/
static int
regmatch(regexchar * re, char *str, int len, int firstp, char **lastpos)
{
char *p = str, *ep = str + len;
char *lpos, *llpos = NULL;
longchar k;
*lastpos = NULL;
#ifdef REGEX_DEBUG
debugre(re, str);
#endif /* REGEX_DEBUG */
while ((re->mode & RE_ENDMARK) == 0) {
if (re->mode & RE_BEGIN) {
if (!firstp)
return 0;
re++;
}
else if (re->mode & RE_ANYTIME) {
short matched, ok = 0;
for (;;) {
matched = 0;
if (regmatch(re + 1, p, ep - p, firstp, &lpos) == 1) {
llpos = lpos;
matched = 1;
ok = 1;
}
if (p >= ep)
break;
#ifdef JP_CHARSET
if (IS_KANJI1(*p)) {
k = RE_KANJI(p);
if (regmatch1(re, k)) {
if (lastpos != NULL)
*lastpos = llpos;
p += 2;
}
else
break;
}
else
#endif
{
k = (unsigned char)*p;
if (regmatch1(re, k)) {
p++;
if (lastpos != NULL)
*lastpos = llpos;
}
else
break;
}
}
if (lastpos != NULL)
*lastpos = llpos;
return ok;
}
else if (re->mode & RE_END) {
if (lastpos != NULL)
*lastpos = p;
return (p >= ep);
}
else {
int a;
#ifdef JP_CHARSET
if (IS_KANJI1(*p)) {
k = RE_KANJI(p);
p += 2;
a = regmatch1(re, k);
}
else
#endif
{
k = (unsigned char)*(p++);
a = regmatch1(re, k);
}
if (!a)
return 0;
else
re++;
}
}
if (lastpos != NULL)
*lastpos = p;
return 1;
}
static int
regmatch1(regexchar * re, longchar c)
{
switch (re->mode & RE_MATCHMODE) {
case RE_ANY:
#ifdef REGEX_DEBUG
printf("%c vs any. -> 1\n", c);
#endif /* REGEX_DEBUG */
return 1;
case RE_NORMAL:
#ifdef REGEX_DEBUG
printf("RE=%c vs %c -> %d\n", *re->pattern, c, *re->pattern == c);
#endif /* REGEX_DEBUG */
if (re->mode & RE_IGNCASE) {
if (*re->pattern < 127 && c < 127 &&
IS_ALPHA(*re->pattern) && IS_ALPHA(c))
return tolower(*re->pattern) == tolower(c);
else
return *re->pattern == c;
}
else
return (*re->pattern == c);
case RE_WHICH:
return matchWhich(re->pattern, c);
case RE_EXCEPT:
return !matchWhich(re->pattern, c);
}
return 0;
}
static int
matchWhich(longchar * pattern, longchar c)
{
longchar *p = pattern;
int ans = 0;
#ifdef REGEX_DEBUG
printf("RE pattern = %s char=%c", pattern, c);
#endif /* REGEX_DEBUG */
while (*p != '\0') {
if (*(p + 1) == RE_WHICH_RANGE && *(p + 2) != '\0') { /* Char class. */
if (*p <= c && c <= *(p + 2)) {
ans = 1;
break;
}
p += 3;
}
else {
if (*p == c) {
ans = 1;
break;
}
p++;
}
}
#ifdef REGEX_DEBUG
printf(" -> %d\n", ans);
#endif /* REGEX_DEBUG */
return ans;
}
#ifdef REGEX_DEBUG
char *
lc2c(longchar * x)
{
static char y[100];
int i = 0;
while (x[i]) {
if (x[i] == RE_WHICH_RANGE)
y[i] = '-';
else
y[i] = x[i];
i++;
}
y[i] = '\0';
return y;
}
void
debugre(re, s)
regexchar *re;
char *s;
{
for (; !(re->mode & RE_ENDMARK); re++) {
if (re->mode & RE_BEGIN) {
printf("Begin ");
continue;
}
else if (re->mode & RE_END) {
printf("End ");
continue;
}
if (re->mode & RE_ANYTIME)
printf("Anytime-");
switch (re->mode & RE_MATCHMODE) {
case RE_ANY:
printf("Any ");
break;
case RE_NORMAL:
printf("Match-to'%c' ", *re->pattern);
break;
case RE_WHICH:
printf("One-of\"%s\" ", lc2c(re->pattern));
break;
case RE_EXCEPT:
printf("Other-than\"%s\" ", lc2c(re->pattern));
break;
default:
printf("Unknown ");
}
}
putchar('\n');
}
#endif /* REGEX_DEBUG */