From ec075380c7739d66a5a150cff635805d369dedfb Mon Sep 17 00:00:00 2001 From: Justin Seyster Date: Thu, 17 Feb 2011 15:54:22 -0500 Subject: [PATCH] Changes to NFA code. - New lexer thats treat multi-letter words as one symbol. - Ability to create multiple NFAs and to simultaneously track multiple inputs on each. - Header file providing an external interface. --- src/nfa.c | 274 +++++++++++++++++++++++++++++++++++++----------------- src/nfa.h | 35 +++++++ 2 files changed, 226 insertions(+), 83 deletions(-) create mode 100644 src/nfa.h diff --git a/src/nfa.c b/src/nfa.c index 0c5826c..5f7b9cf 100644 --- a/src/nfa.c +++ b/src/nfa.c @@ -40,15 +40,94 @@ #include #include +static int +is_whitespace(char c) +{ + return (c == ' ' || c == '\t' || c== '\n'); +} + +static int +is_operator(char c) +{ + return (c == '(' || c == '|' || c==')' || c=='*' || c=='+' || c=='?'); +} + +/* _0-9a-zA-Z */ +static int +is_symbol_char(char c) +{ + return (c == '_' || (c >= '0' && c <='9') || (c >= 'a' && c <='z') + || (c >= 'A' && c <='Z')); +} + +/* Writing this regexp tokenizer would be a whole lot easier if I + * could just _use_ regexps to do it. Sigh. + * + * Returns (, |, ), *, +, or ? if the parsed token is one of those + * operators; ! if the tokenizer encounters an error; '\0' if there is + * nothing left to parse; and some other value if the token is an + * indentifier. + * + * The parsed token is stored in token and its length in len. + **/ +static char +pop_token(const char** re, const char** token, int* len) +{ + while(**re != '\0' && is_whitespace(**re)) + (*re)++; + + if (**re == '\0'){ + *len = 0; + return '\0'; + } + else if(is_operator(**re)){ + *token = *re; + *len = 1; + (*re)++; + + return **token; + } + + if (!is_symbol_char(**re)){ + *len = 0; + return '!'; /* ! for error. */ + } + + *token = *re; + for (*len = 0; is_symbol_char(**re); (*re)++, (*len)++) + ; + + return **token; +} + +/* Find the index of token (which has length len and need not be + * null-terminated) in the list of names. */ +static int +lookup_name(const char* token, int len, const char** names, int num_names) +{ + int i; + + for(i = 0; i < num_names; i++){ + if(strncmp(token, names[i], len) == 0) + return i; + } + + return -1; +} + /* * Convert infix regexp re to postfix notation. * Insert . as explicit concatenation operator. * Cheesy parser, return static buffer. */ -char* -re2post(char *re) +static char* +re2post(const char* re, const char** names, int num_names) { int nalt, natom; + char c; + const char *token; + int len; + int sym; static char buf[8000]; char *dst; struct { @@ -62,8 +141,10 @@ re2post(char *re) natom = 0; if(strlen(re) >= sizeof buf/2) return NULL; - for(; *re; re++){ - switch(*re){ + for(c = pop_token(&re, &token, &len); c; c = pop_token(&re, &token, &len)){ + switch(c){ + case '!': + return NULL; case '(': if(natom > 1){ --natom; @@ -103,14 +184,18 @@ re2post(char *re) case '?': if(natom == 0) return NULL; - *dst++ = *re; + *dst++ = c; break; default: if(natom > 1){ --natom; *dst++ = '.'; } - *dst++ = *re; + sym = lookup_name(token, len, names, num_names); + if(sym < 0){ + return NULL; + } + *dst++ = (char)(sym + 'A'); natom++; break; } @@ -142,10 +227,15 @@ struct State int c; State *out; State *out1; - int lastlist; }; -State matchstate = { Match }; /* matching state */ -int nstate; +static State matchstate = { Match }; /* matching state */ + +typedef struct NFA NFA; +struct NFA +{ + State* start; + int num_states; +}; /* Allocate and initialize State */ State* @@ -153,9 +243,7 @@ state(int c, State *out, State *out1) { State *s; - nstate++; s = malloc(sizeof *s); - s->lastlist = 0; s->c = c; s->out = out; s->out1 = out1; @@ -177,7 +265,7 @@ struct Frag }; /* Initialize Frag struct. */ -Frag +static Frag frag(State *start, Ptrlist *out) { Frag n = { start, out }; @@ -196,7 +284,7 @@ union Ptrlist }; /* Create singleton list containing just outp. */ -Ptrlist* +static Ptrlist* list1(State **outp) { Ptrlist *l; @@ -207,7 +295,7 @@ list1(State **outp) } /* Patch the list of states at out to point to start. */ -void +static void patch(Ptrlist *l, State *s) { Ptrlist *next; @@ -219,7 +307,7 @@ patch(Ptrlist *l, State *s) } /* Join the two lists l1 and l2, returning the combination. */ -Ptrlist* +static Ptrlist* append(Ptrlist *l1, Ptrlist *l2) { Ptrlist *oldl1; @@ -233,20 +321,25 @@ append(Ptrlist *l1, Ptrlist *l2) /* * Convert postfix regular expression to NFA. - * Return start state. */ -State* +static NFA* post2nfa(char *postfix) { char *p; Frag stack[1000], *stackp, e1, e2, e; State *s; + NFA *nfa; // fprintf(stderr, "postfix: %s\n", postfix); if(postfix == NULL) return NULL; + nfa = malloc(sizeof(NFA)); + if (nfa == NULL) + return NULL; + nfa->num_states = 1; /* The final (matching) state. */ + #define push(s) *stackp++ = s #define pop() *--stackp @@ -255,6 +348,7 @@ post2nfa(char *postfix) switch(*p){ default: s = state(*p, NULL, NULL); + nfa->num_states++; push(frag(s, list1(&s->out))); break; case '.': /* catenate */ @@ -267,22 +361,26 @@ post2nfa(char *postfix) e2 = pop(); e1 = pop(); s = state(Split, e1.start, e2.start); + nfa->num_states++; push(frag(s, append(e1.out, e2.out))); break; case '?': /* zero or one */ e = pop(); s = state(Split, e.start, NULL); + nfa->num_states++; push(frag(s, append(e.out, list1(&s->out1)))); break; case '*': /* zero or more */ e = pop(); s = state(Split, e.start, NULL); + nfa->num_states++; patch(e.out, s); push(frag(s, list1(&s->out1))); break; case '+': /* one or more */ e = pop(); s = state(Split, e.start, NULL); + nfa->num_states++; patch(e.out, s); push(frag(e.start, list1(&s->out1))); break; @@ -294,7 +392,8 @@ post2nfa(char *postfix) return NULL; patch(e.out, &matchstate); - return e.start; + nfa->start = e.start; + return nfa; #undef pop #undef push } @@ -305,41 +404,38 @@ struct List State **s; int n; }; -List l1, l2; -static int listid; -void addstate(List*, State*); -void step(List*, int, List*); +typedef struct NState NState; +struct NState +{ + struct List l; + int num_states; +}; -/* Compute initial state list */ -List* -startlist(State *start, List *l) +static int +inlist(List *l, State *s) { - l->n = 0; - listid++; - addstate(l, start); - return l; + int i; + for (i = 0; i < l->n; i++) + if (l->s[i] == s) + return 1; + + return 0; } /* Check whether state list contains a match. */ int -ismatch(List *l) +ismatch(NState *ns) { - int i; - - for(i=0; in; i++) - if(l->s[i] == &matchstate) - return 1; - return 0; + return inlist(&ns->l, &matchstate); } /* Add s to l, following unlabeled arrows. */ -void +static void addstate(List *l, State *s) { - if(s == NULL || s->lastlist == listid) + if(s == NULL || inlist(l, s)) return; - s->lastlist = listid; if(s->c == Split){ /* follow unlabeled arrows */ addstate(l, s->out); @@ -355,67 +451,79 @@ addstate(List *l, State *s) * to create next NFA state set nlist. */ void -step(List *clist, int c, List *nlist) +step(NState *ns, int c) { int i; State *s; + List *srclist; + List dstlist; - listid++; - nlist->n = 0; - for(i=0; in; i++){ - s = clist->s[i]; + /* Note that we encode input events as characters starting + * from A. (Yeah, it's kind of a hack.) */ + c += 'A'; + + srclist = &ns->l; + dstlist.n = 0; + dstlist.s = alloca(ns->num_states * sizeof(State*)); + + /* Hack: The first state in the state list is _always_ the + * start state. That way, we match any input that has a + * matching suffix. */ + addstate(&dstlist, srclist->s[0]); + + for(i=0; in; i++){ + s = srclist->s[i]; if(s->c == c) - addstate(nlist, s->out); + addstate(&dstlist, s->out); + } + + /* Copy dstlist back to srclist. */ + srclist->n = dstlist.n; + memcpy(srclist->s, dstlist.s, ns->num_states * sizeof(State*)); +} + +NState* +getstartstate(NFA *nfa) +{ + NState *start = malloc(sizeof(List)); + State **s = malloc(nfa->num_states * sizeof(State*)); + if (start == NULL || s == NULL){ + free(start); + free(s); + return NULL; } + start->l.n = 0; + start->l.s = s; + start->num_states = nfa->num_states; + addstate(&start->l, nfa->start); + return start; } -/* Run NFA to determine whether it matches s. */ -int -match(State *start, char *s) +void +freenstate(NState *ns) { - int i, c; - List *clist, *nlist, *t; - - clist = startlist(start, &l1); - nlist = &l2; - for(; *s; s++){ - c = *s & 0xFF; - step(clist, c, nlist); - t = clist; clist = nlist; nlist = t; /* swap clist, nlist */ + if (ns != NULL){ + free(ns->l.s); + free(ns); } - return ismatch(clist); } -int -main(int argc, char **argv) +NFA* +compile_re(const char* re, const char** names, int num_names) { - int i; char *post; - State *start; + NFA *nfa; - if(argc < 3){ - fprintf(stderr, "usage: nfa regexp string...\n"); - return 1; - } - - post = re2post(argv[1]); - if(post == NULL){ - fprintf(stderr, "bad regexp %s\n", argv[1]); - return 1; + post = re2post(re, names, num_names); + if (post == NULL){ + return NULL; } - start = post2nfa(post); - if(start == NULL){ - fprintf(stderr, "error in post2nfa %s\n", post); - return 1; - } - - l1.s = malloc(nstate*sizeof l1.s[0]); - l2.s = malloc(nstate*sizeof l2.s[0]); - for(i=2; i. */ + +/* The interface exposed to tracecut.c from our modified dfa1.c */ + +struct NFA; +struct NState; + +extern int ismatch (struct NState *ns); +extern void step (struct NState *ns, int c); +extern struct NState *getstartstate(struct NFA *nfa); +extern void freenstate (struct NState *ns); +extern struct NFA *compile_re(const char *re, const char **names, + int num_names); -- 2.34.1