aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFumitoshi UKAI <ukai@debian.or.jp>2002-01-10 04:55:06 +0000
committerFumitoshi UKAI <ukai@debian.or.jp>2002-01-10 04:55:06 +0000
commit31d84e0083f0ee9ef6fea93ac85e537cf1688ca7 (patch)
tree54a0d08accf6728d193902418db7a2bfec23859f
parent[w3m-dev 02810] (diff)
downloadw3m-31d84e0083f0ee9ef6fea93ac85e537cf1688ca7.tar.gz
w3m-31d84e0083f0ee9ef6fea93ac85e537cf1688ca7.zip
[w3m-dev 02811] new regexp implementation
From: aito@fw.ipsj.or.jp
Diffstat (limited to '')
-rw-r--r--ChangeLog25
-rw-r--r--NEWS5
-rw-r--r--regex.c625
-rw-r--r--regex.h35
4 files changed, 532 insertions, 158 deletions
diff --git a/ChangeLog b/ChangeLog
index dbbfc44..2996259 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,26 @@
+2002-01-10 aito@fw.ipsj.or.jp
+
+ * [w3m-dev 02811] new regexp implementation
+ * regex.h: remove defines
+ * regex.h: struct regexchar, struct regex
+ * regex.c: include stdio.h, stdlib.h, string.h, config.h
+ * regex.c: remove include fm.h
+ * regex.c (NULL JP_CHARSET IS_ALPHA IS_KANJI1): define unless defined
+ * regex.c (RE_*) define
+ * regex.c (st_ptr): deleted
+ * regex.c (regmatch): remove 4th arg `int'
+ * regex.c (newRegex0): rewrite, based on old newRegex
+ * regex.c (newRegex): use newRegex0()
+ * regex.c (RegexMatch): rewrite
+ * regex.c (struct MatchingContext1): added
+ * regex.c (struct MatchingContext2): added
+ * regex.c (YIELD): added
+ * regex.c (regmatch_sub_anytime): added
+ * regex.c (regmatch_iter): added
+ * regex.c (regmatch): use regmatch_iter()
+ * regex.c (regmatch1): rewrite
+ * regex.c (lc2c): add 2nd arg `len', rewrite
+
2002-01-10 Yoshinobu Sakane <sakane@d4.bsd.nes.nec.co.jp>
* [w3m-dev 02810]
@@ -1795,4 +1818,4 @@
* release-0-2-1
* import w3m-0.2.1
-$Id: ChangeLog,v 1.202 2002/01/10 03:45:13 ukai Exp $
+$Id: ChangeLog,v 1.203 2002/01/10 04:55:06 ukai Exp $
diff --git a/NEWS b/NEWS
index e73b83e..7faf9bb 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,8 @@
+w3m 0.3?
+
+* new regex implementation
+
+----------------------------------------------------------------
w3m 0.2.4 - 2002-01-07
* RFC2818 server identity check
diff --git a/regex.c b/regex.c
index 035483a..04bf302 100644
--- a/regex.c
+++ b/regex.c
@@ -1,18 +1,60 @@
-/* $Id: regex.c,v 1.6 2001/11/30 10:10:24 ukai Exp $ */
+/* $Id: regex.c,v 1.7 2002/01/10 04:55:07 ukai Exp $ */
/*
* regex: Regular expression pattern match library
*
* by A.ITO, December 1989
+ * Revised by A.ITO, January 2002
*/
#ifdef REGEX_DEBUG
#include <sys/types.h>
#include <malloc.h>
#endif /* REGEX_DEBUG */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
#include <ctype.h>
#include <gc.h>
-#include "fm.h"
#include "regex.h"
+#include "config.h"
+
+#ifndef NULL
+#define NULL 0
+#endif /* not NULL */
+
+#if LANG == JA
+#define JP_CHARSET
+#endif
+
+#define RE_ITER_LIMIT 65535
+
+#define RE_MATCHMODE 0x07
+#define RE_NORMAL 0x00
+#define RE_ANY 0x01
+#define RE_WHICH 0x02
+#define RE_EXCEPT 0x03
+#define RE_SUBREGEX 0x04
+#define RE_BEGIN 0x05
+#define RE_END 0x06
+#define RE_ENDMARK 0x07
+
+#define RE_OPT 0x08
+#define RE_ANYTIME 0x10
+#define RE_IGNCASE 0x40
+
+#define RE_MODE(x) ((x)->mode&RE_MATCHMODE)
+#define RE_SET_MODE(x,v) ((x)->mode = (((x)->mode&~RE_MATCHMODE)|((v)&RE_MATCHMODE)))
+
+#ifdef REGEX_DEBUG
+void debugre(regexchar *);
+char *lc2c(longchar *, int);
+int verbose;
+#endif /* REGEX_DEBUG */
+
+#ifndef IS_ALPHA
+#define IS_ALPHA(x) (!((x)&0x80) && isalpha(x))
+#define IS_KANJI1(x) ((x)&0x80)
+#endif
#ifdef JP_CHARSET
#define RE_KANJI(p) (((unsigned char)*(p) << 8) | (unsigned char)*((p)+1))
@@ -24,13 +66,10 @@ static Regex DefaultRegex;
#define CompiledRegex DefaultRegex.re
#define Cstorage DefaultRegex.storage
-static longchar *st_ptr;
-
-static int regmatch(regexchar *, char *, int, int, char **);
+static int regmatch(regexchar *, char *, int, char **);
static int regmatch1(regexchar *, longchar);
static int matchWhich(longchar *, longchar);
-
/*
* regexCompile: compile regular expression
*/
@@ -42,41 +81,66 @@ regexCompile(char *ex, int igncase)
return msg;
}
-Regex *
-newRegex(char *ex, int igncase, Regex *regex, char **msg)
+static Regex *
+newRegex0(char **ex, int igncase, Regex *regex, char **msg, int level)
{
char *p;
longchar *r;
- regexchar *re = regex->re - 1;
+ regexchar *re;
int m;
+ longchar *st_ptr;
- if (regex == 0)
- regex = (Regex *)GC_malloc_atomic(sizeof(Regex));
+ if (regex == NULL)
+ regex = (Regex *)GC_malloc(sizeof(Regex));
+ regex->alt_regex = NULL;
+ re = regex->re;
st_ptr = regex->storage;
- for (p = ex; *p != '\0'; p++) {
+ for (p = *ex; *p != '\0'; p++) {
+ re->mode = 0;
switch (*p) {
case '.':
+ re->p.pattern = NULL;
+ RE_SET_MODE(re, RE_ANY);
re++;
- re->pattern = NULL;
- re->mode = RE_ANY;
break;
case '$':
+ re->p.pattern = NULL;
+ RE_SET_MODE(re, RE_END);
re++;
- re->pattern = NULL;
- re->mode = RE_END;
break;
case '^':
+ re->p.pattern = NULL;
+ RE_SET_MODE(re, RE_BEGIN);
re++;
- re->pattern = NULL;
- re->mode = RE_BEGIN;
break;
- case '*':
- if (!(re->mode & RE_ANY) && re->pattern == NULL) {
+ case '+':
+ if (re == regex->re ||
+ (RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) {
if (msg)
*msg = "Invalid regular expression";
return NULL;
}
+ *re = *(re - 1);
re->mode |= RE_ANYTIME;
+ re++;
+ break;
+ case '*':
+ if (re == regex->re ||
+ (RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) {
+ if (msg)
+ *msg = "Invalid regular expression";
+ return NULL;
+ }
+ (re - 1)->mode |= RE_ANYTIME;
+ break;
+ case '?':
+ if (re == regex->re ||
+ (RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) {
+ if (msg)
+ *msg = "Invalid regular expression";
+ return NULL;
+ }
+ (re - 1)->mode |= RE_OPT;
break;
case '[':
r = st_ptr;
@@ -110,14 +174,40 @@ newRegex(char *ex, int igncase, Regex *regex, char **msg)
*(st_ptr++) = (unsigned char)*(p++);
}
*(st_ptr++) = '\0';
+ re->p.pattern = r;
+ RE_SET_MODE(re, m);
+ re++;
+ break;
+ case '|':
+ RE_SET_MODE(re, RE_ENDMARK);
+ re++;
+ p++;
+ regex->alt_regex = newRegex0(&p, igncase, NULL, msg, level);
+ if (regex->alt_regex == NULL)
+ return NULL;
+ *ex = p;
+ return regex;
+ case '(':
+ RE_SET_MODE(re, RE_SUBREGEX);
+ p++;
+ re->p.sub = newRegex0(&p, igncase, NULL, msg, level + 1);
+ if (re->p.sub == NULL)
+ return NULL;
re++;
- re->pattern = r;
- re->mode = m;
break;
+ case ')':
+ if (level == 0) {
+ if (msg)
+ *msg = "Too many ')'";
+ return NULL;
+ }
+ RE_SET_MODE(re, RE_ENDMARK);
+ re++;
+ *ex = p;
+ return regex;
case '\\':
p++;
default:
- re++;
#ifdef JP_CHARSET
if (IS_KANJI1(*p)) {
*(st_ptr) = RE_KANJI(p);
@@ -126,26 +216,33 @@ newRegex(char *ex, int igncase, Regex *regex, char **msg)
else
#endif
*st_ptr = (unsigned char)*p;
- re->pattern = st_ptr;
+ re->p.pattern = st_ptr;
st_ptr++;
- re->mode = RE_NORMAL;
+ RE_SET_MODE(re, RE_NORMAL);
if (igncase)
re->mode |= RE_IGNCASE;
+ re++;
}
- if (st_ptr >= &Cstorage[STORAGE_MAX] ||
- re >= &CompiledRegex[REGEX_MAX]) {
+ if (st_ptr >= &regex->storage[STORAGE_MAX] ||
+ re >= &regex->re[REGEX_MAX]) {
if (msg)
*msg = "Regular expression too long";
return NULL;
}
}
- re++;
- re->mode = RE_ENDMARK;
+ RE_SET_MODE(re, RE_ENDMARK);
if (msg)
*msg = NULL;
+ *ex = p;
return regex;
}
+Regex *
+newRegex(char *ex, int igncase, Regex *regex, char **msg)
+{
+ return newRegex0(&ex, igncase, regex, msg, 0);
+}
+
/*
* regexMatch: match regular expression
*/
@@ -159,20 +256,33 @@ int
RegexMatch(Regex *re, char *str, int len, int firstp)
{
char *p, *ep;
+ char *lpos;
+ Regex *r;
if (str == NULL)
return 0;
+ if (len == 0)
+ len = strlen(str);
re->position = NULL;
- ep = str + ((len == 0) ? strlen(str) : len);
+ ep = str + len;
for (p = str; p < ep; p++) {
- switch (regmatch
- (re->re, p, ep - p, firstp && (p == str), &re->lposition)) {
- case 1:
- re->position = p;
+ lpos = NULL;
+ re->lposition = NULL;
+ for (r = re; r != NULL; r = r->alt_regex) {
+ switch (regmatch(r->re, p, firstp && (p == str), &lpos)) {
+ case 1: /* matched */
+ re->position = p;
+ if (re->lposition == NULL || re->lposition < lpos)
+ re->lposition = lpos;
+ break;
+ case -1: /* error */
+ re->position = NULL;
+ return -1;
+ }
+ }
+ if (re->lposition != NULL) {
+ /* matched */
return 1;
- case -1:
- re->position = NULL;
- return -1;
}
#ifdef JP_CHARSET
if (IS_KANJI1(*p))
@@ -202,118 +312,306 @@ matchedPosition(char **first, char **last)
/*
* Intermal routines
*/
+
+struct MatchingContext1 {
+ int label;
+ regexchar *re;
+ char *lastpos;
+ char *str;
+ int iter_limit;
+ int n_any;
+ int firstp;
+ char *end_p;
+ Regex *sub_regex;
+ struct MatchingContext1 *sub_ctx;
+ struct MatchingContext2 *ctx2;
+};
+
+struct MatchingContext2 {
+ int label;
+ Regex *regex;
+ char *lastpos;
+ struct MatchingContext1 *ctx;
+ struct MatchingContext2 *ctx2;
+ char *str;
+ int n_any;
+ int firstp;
+};
+
+
+#define YIELD(retval,context,lnum) (context)->label = lnum; return (retval); label##lnum:
+
+static int regmatch_iter(struct MatchingContext1 *, regexchar *, char *, int);
+
static int
-regmatch(regexchar * re, char *str, int len, int firstp, char **lastpos)
+regmatch_sub_anytime(struct MatchingContext2 *c, Regex *regex,
+ regexchar * pat2, char *str, int iter_limit, int firstp)
{
- char *p = str, *ep = str + len;
- char *lpos, *llpos = NULL;
- longchar k;
+ switch (c->label) {
+ case 1:
+ goto label1;
+ case 2:
+ goto label2;
+ case 3:
+ goto label3;
+ }
+ c->ctx = GC_malloc(sizeof(struct MatchingContext1));
+ c->ctx2 = GC_malloc(sizeof(struct MatchingContext2));
+ c->ctx->label = 0;
+ c->regex = regex;
+ c->n_any = 0;
+ c->str = str;
+ c->firstp = firstp;
+ for (;;) {
+ c->ctx->label = 0;
+ while (regmatch_iter(c->ctx, c->regex->re, c->str, c->firstp)) {
+ c->n_any = c->ctx->lastpos - c->str;
+ if (c->n_any <= 0)
+ continue;
+ c->firstp = 0;
+ if (RE_MODE(pat2) == RE_ENDMARK) {
+ c->lastpos = c->str + c->n_any;
+ YIELD(1, c, 1);
+ }
+ else if (regmatch(pat2, c->str + c->n_any,
+ c->firstp, &c->lastpos) == 1) {
+ YIELD(1, c, 2);
+ }
+ if (iter_limit == 1)
+ continue;
+ c->ctx2->label = 0;
+ while (regmatch_sub_anytime(c->ctx2, regex, pat2,
+ c->str + c->n_any, iter_limit - 1,
+ c->firstp)) {
- *lastpos = NULL;
-#ifdef REGEX_DEBUG
- debugre(re, str);
-#endif /* REGEX_DEBUG */
- while ((re->mode & RE_ENDMARK) == 0) {
- if (re->mode & RE_BEGIN) {
- if (!firstp)
- return 0;
- re++;
+ c->lastpos = c->ctx2->lastpos;
+ YIELD(1, c, 3);
+ }
}
- else if (re->mode & RE_ANYTIME) {
- short matched, ok = 0;
- for (;;) {
- matched = 0;
- if (regmatch(re + 1, p, ep - p, firstp, &lpos) == 1) {
- llpos = lpos;
- matched = 1;
- ok = 1;
+ if (c->regex->alt_regex == NULL)
+ break;
+ c->regex = c->regex->alt_regex;
+ }
+ return 0;
+}
+
+static int
+regmatch_iter(struct MatchingContext1 *c,
+ regexchar * re, char *str, int firstp)
+{
+ switch (c->label) {
+ case 1:
+ goto label1;
+ case 2:
+ goto label2;
+ case 3:
+ goto label3;
+ case 4:
+ goto label4;
+ case 5:
+ goto label5;
+ case 6:
+ goto label6;
+ case 7:
+ goto label7;
+ }
+ if (RE_MODE(re) == RE_ENDMARK)
+ return 0;
+ c->re = re;
+ c->end_p = str + strlen(str);
+ c->firstp = firstp;
+ c->str = str;
+ c->sub_ctx = NULL;
+ while (RE_MODE(c->re) != RE_ENDMARK) {
+ if (c->re->mode & (RE_ANYTIME | RE_OPT)) {
+ if (c->re->mode & RE_ANYTIME)
+ c->iter_limit = RE_ITER_LIMIT;
+ else
+ c->iter_limit = 1;
+ c->n_any = -1;
+ while (c->n_any < c->iter_limit) {
+ if (c->str + c->n_any >= c->end_p) {
+ return 0;
}
- if (p >= ep)
- break;
+ if (c->n_any >= 0) {
+ if (RE_MODE(c->re) == RE_SUBREGEX) {
+ c->ctx2 = GC_malloc(sizeof(struct MatchingContext2));
+ c->ctx2->label = 0;
+ while (regmatch_sub_anytime(c->ctx2,
+ c->re->p.sub,
+ c->re + 1,
+ c->str + c->n_any,
+ c->iter_limit,
+ c->firstp)) {
+ c->n_any = c->ctx2->lastpos - c->str;
+ c->lastpos = c->ctx2->lastpos;
+ YIELD(1, c, 1);
+ }
+ return 0;
+ }
#ifdef JP_CHARSET
- if (IS_KANJI1(*p)) {
- k = RE_KANJI(p);
- if (regmatch1(re, k)) {
- if (lastpos != NULL)
- *lastpos = llpos;
- p += 2;
+ else if (IS_KANJI1(c->str[c->n_any])) {
+ longchar k;
+ k = RE_KANJI(c->str + c->n_any);
+ if (regmatch1(c->re, k)) {
+ c->n_any += 2;
+ }
+ else {
+ return 0;
+ }
+ c->firstp = 0;
}
- else
- break;
- }
- else
#endif
- {
- k = (unsigned char)*p;
- if (regmatch1(re, k)) {
- p++;
- if (lastpos != NULL)
- *lastpos = llpos;
+ else {
+ longchar k;
+ k = (unsigned char)c->str[c->n_any];
+ if (regmatch1(c->re, k)) {
+ c->n_any++;
+ }
+ else {
+ return 0;
+ }
+ c->firstp = 0;
}
- else
- break;
+ }
+ else
+ c->n_any++;
+ if (RE_MODE(c->re + 1) == RE_ENDMARK) {
+ c->lastpos = c->str + c->n_any;
+ YIELD(1, c, 2);
+ }
+ else if (regmatch(c->re + 1, c->str + c->n_any,
+ c->firstp, &c->lastpos) == 1) {
+ YIELD(1, c, 3);
}
}
- if (lastpos != NULL)
- *lastpos = llpos;
- return ok;
- }
- else if (re->mode & RE_END) {
- if (lastpos != NULL)
- *lastpos = p;
- return (p >= ep);
+ return 0;
}
- else {
- int a;
+ /* regexp other than pat*, pat+ and pat? */
+ if (c->str >= c->end_p)
+ return 0;
+ switch (RE_MODE(c->re)) {
+ case RE_BEGIN:
+ if (!c->firstp)
+ return 0;
+ c->re++;
+ break;
+ case RE_END:
+ c->lastpos = c->str;
+ c->re++;
+ YIELD((c->str >= c->end_p), c, 4);
+ break;
+ case RE_SUBREGEX:
+ if (c->sub_ctx == NULL) {
+ c->sub_ctx = GC_malloc(sizeof(struct MatchingContext1));
+ }
+ c->sub_regex = c->re->p.sub;
+ for (;;) {
+ c->sub_ctx->label = 0;
+ while (regmatch_iter(c->sub_ctx, c->sub_regex->re,
+ c->str, c->firstp)) {
+ if (c->sub_ctx->lastpos != c->str)
+ c->firstp = 0;
+ if (RE_MODE(c->re + 1) == RE_ENDMARK) {
+ c->lastpos = c->sub_ctx->lastpos;
+ YIELD(1, c, 5);
+ }
+ else if (regmatch(c->re + 1, c->sub_ctx->lastpos,
+ c->firstp, &c->lastpos) == 1) {
+ YIELD(1, c, 6);
+ }
+ }
+ if (c->sub_regex->alt_regex == NULL)
+ break;
+ c->sub_regex = c->sub_regex->alt_regex;
+ }
+ return 0;
+ default:
#ifdef JP_CHARSET
- if (IS_KANJI1(*p)) {
- k = RE_KANJI(p);
- p += 2;
- a = regmatch1(re, k);
+ if (IS_KANJI1(*c->str)) {
+ longchar k;
+ k = RE_KANJI(c->str);
+ c->str += 2;
+ if (!regmatch1(c->re, k))
+ return 0;
}
else
#endif
{
- k = (unsigned char)*(p++);
- a = regmatch1(re, k);
+ longchar k;
+ k = (unsigned char)*(c->str++);
+ if (!regmatch1(c->re, k))
+ return 0;
}
- if (!a)
- return 0;
- else
- re++;
+ c->re++;
+ c->firstp = 0;
+ }
+ }
+ c->lastpos = c->str;
+#ifdef REGEX_DEBUG
+ if (verbose)
+ printf("Succeed: %s %d\n", c->str, c->lastpos - c->str);
+#endif
+ YIELD(1, c, 7);
+ return 0;
+}
+
+static int
+regmatch(regexchar * re, char *str, int firstp, char **lastpos)
+{
+ struct MatchingContext1 contx;
+
+ *lastpos = NULL;
+
+ contx.label = 0;
+ while (regmatch_iter(&contx, re, str, firstp)) {
+#ifdef REGEX_DEBUG
+ char *p;
+ if (verbose) {
+ printf("regmatch: matched <");
+ for (p = str; p < contx.lastpos; p++)
+ putchar(*p);
+ printf(">\n");
}
+#endif
+ if (*lastpos == NULL || *lastpos < contx.lastpos)
+ *lastpos = contx.lastpos;
}
- if (lastpos != NULL)
- *lastpos = p;
+ if (*lastpos == NULL)
+ return 0;
return 1;
}
+
static int
regmatch1(regexchar * re, longchar c)
{
- switch (re->mode & RE_MATCHMODE) {
+ switch (RE_MODE(re)) {
case RE_ANY:
#ifdef REGEX_DEBUG
- printf("%c vs any. -> 1\n", c);
+ if (verbose)
+ printf("%c vs any. -> 1\n", c);
#endif /* REGEX_DEBUG */
return 1;
case RE_NORMAL:
#ifdef REGEX_DEBUG
- printf("RE=%c vs %c -> %d\n", *re->pattern, c, *re->pattern == c);
+ if (verbose)
+ printf("RE=%c vs %c -> %d\n", *re->p.pattern, c,
+ *re->p.pattern == c);
#endif /* REGEX_DEBUG */
if (re->mode & RE_IGNCASE) {
- if (*re->pattern < 127 && c < 127 &&
- IS_ALPHA(*re->pattern) && IS_ALPHA(c))
- return tolower(*re->pattern) == tolower(c);
+ if (*re->p.pattern < 127 && c < 127 &&
+ IS_ALPHA(*re->p.pattern) && IS_ALPHA(c))
+ return tolower(*re->p.pattern) == tolower(c);
else
- return *re->pattern == c;
+ return *re->p.pattern == c;
}
else
- return (*re->pattern == c);
+ return (*re->p.pattern == c);
case RE_WHICH:
- return matchWhich(re->pattern, c);
+ return matchWhich(re->p.pattern, c);
case RE_EXCEPT:
- return !matchWhich(re->pattern, c);
+ return !matchWhich(re->p.pattern, c);
}
return 0;
}
@@ -325,7 +623,8 @@ matchWhich(longchar * pattern, longchar c)
int ans = 0;
#ifdef REGEX_DEBUG
- printf("RE pattern = %s char=%c", pattern, c);
+ if (verbose)
+ printf("RE pattern = %s char=%s", lc2c(pattern, 10000), lc2c(&c, 1));
#endif /* REGEX_DEBUG */
while (*p != '\0') {
if (*(p + 1) == RE_WHICH_RANGE && *(p + 2) != '\0') { /* Char class. */
@@ -344,64 +643,130 @@ matchWhich(longchar * pattern, longchar c)
}
}
#ifdef REGEX_DEBUG
- printf(" -> %d\n", ans);
+ if (verbose)
+ printf(" -> %d\n", ans);
#endif /* REGEX_DEBUG */
return ans;
}
#ifdef REGEX_DEBUG
char *
-lc2c(longchar * x)
+lc2c(longchar * x, int len)
{
static char y[100];
int i = 0;
+ char *r;
- while (x[i]) {
+ while (x[i] && i < len) {
if (x[i] == RE_WHICH_RANGE)
- y[i] = '-';
+ y[i++] = '-';
+ else if (x[i] >= 128) {
+ y[i++] = ((x[i] >> 8) & 0xff);
+ y[i++] = (x[i] & 0xff);
+ }
else
- y[i] = x[i];
- i++;
+ y[i++] = x[i];
}
y[i] = '\0';
- return y;
+ r = GC_malloc_atomic(i + 1);
+ strcpy(r, y);
+ return r;
}
void
-debugre(re, s)
- regexchar *re;
- char *s;
+debugre(regexchar * re)
{
- for (; !(re->mode & RE_ENDMARK); re++) {
- if (re->mode & RE_BEGIN) {
+ for (; RE_MODE(re) != RE_ENDMARK; re++) {
+ switch (RE_MODE(re)) {
+ case RE_BEGIN:
printf("Begin ");
continue;
- }
- else if (re->mode & RE_END) {
+ case RE_END:
printf("End ");
continue;
}
if (re->mode & RE_ANYTIME)
printf("Anytime-");
+ if (re->mode & RE_OPT)
+ printf("Opt-");
- switch (re->mode & RE_MATCHMODE) {
+ switch (RE_MODE(re)) {
case RE_ANY:
printf("Any ");
break;
case RE_NORMAL:
- printf("Match-to'%c' ", *re->pattern);
+ printf("Match-to'%c' ", *re->p.pattern);
break;
case RE_WHICH:
- printf("One-of\"%s\" ", lc2c(re->pattern));
+ printf("One-of\"%s\" ", lc2c(re->p.pattern, 10000));
break;
case RE_EXCEPT:
- printf("Other-than\"%s\" ", lc2c(re->pattern));
+ printf("Other-than\"%s\" ", lc2c(re->p.pattern, 10000));
break;
+ case RE_SUBREGEX:
+ {
+ Regex *r = re->p.sub;
+ printf("(");
+ while (r) {
+ debugre(r->re);
+ if (r->alt_regex)
+ printf(" | ");
+ r = r->alt_regex;
+ }
+ printf(")");
+ break;
+ }
default:
printf("Unknown ");
}
}
- putchar('\n');
}
#endif /* REGEX_DEBUG */
+
+#ifdef REGEXTEST
+int
+main(int argc, char **argv)
+{
+ char buf[128], buf2[128];
+ char *msg;
+ Regex *re;
+ char *fpos, *epos;
+ FILE *f = stdin;
+ int i = 1;
+
+#ifdef REGEX_DEBUG
+ for (i = 1; i < argc; i++) {
+ if (strcmp(argv[i], "-v") == 0)
+ verbose = 1;
+ else
+ break;
+ }
+#endif
+
+ if (argc > i)
+ f = fopen(argv[i], "r");
+ if (f == NULL) {
+ fprintf(stderr, "Can't open %s\n", argv[i]);
+ exit(1);
+ }
+ while (fscanf(f, "%s%s", buf, buf2) == 2) {
+ re = newRegex(buf, 0, NULL, &msg);
+ if (re == NULL) {
+ printf("Error on regexp /%s/: %s\n", buf, msg);
+ exit(1);
+ }
+ if (RegexMatch(re, buf2, 0, 1)) {
+ printf("/%s/\t%s\t", buf, buf2);
+ MatchedPosition(re, &fpos, &epos);
+ while (fpos < epos)
+ putchar(*(fpos++));
+ }
+ else
+ printf("/%s/\t%s\tno_match", buf, buf2);
+ putchar('\n');
+ }
+ /* notreatched */
+ return 0;
+}
+#endif
diff --git a/regex.h b/regex.h
index 5a8d986..46442b4 100644
--- a/regex.h
+++ b/regex.h
@@ -1,44 +1,25 @@
-/* $Id: regex.h,v 1.3 2001/11/24 02:01:26 ukai Exp $ */
+/* $Id: regex.h,v 1.4 2002/01/10 04:55:07 ukai Exp $ */
#define REGEX_MAX 64
#define STORAGE_MAX 256
-#ifndef NULL
-#define NULL 0
-#endif /* not NULL */
-
-#define RE_NORMAL 0
-#define RE_MATCHMODE 0x07
-#define RE_ANY 0x01
-#define RE_WHICH 0x02
-#define RE_EXCEPT 0x04
-#define RE_ANYTIME 0x08
-#define RE_BEGIN 0x10
-#define RE_END 0x20
-#define RE_IGNCASE 0x40
-#define RE_ENDMARK 0x80
typedef unsigned short longchar;
-
-typedef struct {
-
- longchar *pattern;
-
+typedef struct regexchar {
+ union {
+ longchar *pattern;
+ struct regex *sub;
+ } p;
unsigned char mode;
-
} regexchar;
-typedef struct {
-
+typedef struct regex {
regexchar re[REGEX_MAX];
-
longchar storage[STORAGE_MAX];
-
char *position;
-
char *lposition;
-
+ struct regex *alt_regex;
} Regex;