From ec075380c7739d66a5a150cff635805d369dedfb Mon Sep 17 00:00:00 2001
From: Justin Seyster <jseyster@cs.sunysb.edu>
Date: Thu, 17 Feb 2011 15:54:22 -0500
Subject: [PATCH] Changes to NFA code.

- New lexer thats treat multi-letter words as one symbol.

- Ability to create multiple NFAs and to simultaneously track multiple
  inputs on each.

- Header file providing an external interface.
---
 src/nfa.c | 274 +++++++++++++++++++++++++++++++++++++-----------------
 src/nfa.h |  35 +++++++
 2 files changed, 226 insertions(+), 83 deletions(-)
 create mode 100644 src/nfa.h

diff --git a/src/nfa.c b/src/nfa.c
index 0c5826c..5f7b9cf 100644
--- a/src/nfa.c
+++ b/src/nfa.c
@@ -40,15 +40,94 @@
 #include <string.h>
 #include <unistd.h>
 
+static int
+is_whitespace(char c)
+{
+	return (c == ' ' || c == '\t' || c== '\n');
+}
+
+static int
+is_operator(char c)
+{
+	return (c == '(' || c == '|' || c==')' || c=='*' || c=='+' || c=='?');
+}
+
+/* _0-9a-zA-Z */
+static int
+is_symbol_char(char c)
+{
+	return (c == '_' || (c >= '0' && c <='9') || (c >= 'a' && c <='z')
+		|| (c >= 'A' && c <='Z'));
+}
+
+/* Writing this regexp tokenizer would be a whole lot easier if I
+ * could just _use_ regexps to do it.  Sigh.
+ *
+ * Returns (, |, ), *, +, or ? if the parsed token is one of those
+ * operators; ! if the tokenizer encounters an error; '\0' if there is
+ * nothing left to parse; and some other value if the token is an
+ * indentifier.
+ *
+ * The parsed token is stored in token and its length in len.
+ **/
+static char
+pop_token(const char** re, const char** token, int* len)
+{
+	while(**re != '\0' && is_whitespace(**re))
+		(*re)++;
+
+	if (**re == '\0'){
+		*len = 0;
+		return '\0';
+	}
+	else if(is_operator(**re)){
+		*token = *re;
+		*len = 1;
+		(*re)++;
+
+		return **token;
+	}
+
+	if (!is_symbol_char(**re)){
+		*len = 0;
+		return '!';  /* ! for error. */
+	}
+
+	*token = *re;
+	for (*len = 0; is_symbol_char(**re); (*re)++, (*len)++)
+		;
+
+	return **token;
+}
+
+/* Find the index of token (which has length len and need not be
+ * null-terminated) in the list of names. */
+static int
+lookup_name(const char* token, int len, const char** names, int num_names)
+{
+	int i;
+
+	for(i = 0; i < num_names; i++){
+		if(strncmp(token, names[i], len) == 0)
+			return i;
+	}
+
+	return -1;
+}
+
 /*
  * Convert infix regexp re to postfix notation.
  * Insert . as explicit concatenation operator.
  * Cheesy parser, return static buffer.
  */
-char*
-re2post(char *re)
+static char*
+re2post(const char* re, const char** names, int num_names)
 {
 	int nalt, natom;
+	char c;
+	const char *token;
+	int len;
+	int sym;
 	static char buf[8000];
 	char *dst;
 	struct {
@@ -62,8 +141,10 @@ re2post(char *re)
 	natom = 0;
 	if(strlen(re) >= sizeof buf/2)
 		return NULL;
-	for(; *re; re++){
-		switch(*re){
+	for(c = pop_token(&re, &token, &len); c; c = pop_token(&re, &token, &len)){
+		switch(c){
+		case '!':
+			return NULL;
 		case '(':
 			if(natom > 1){
 				--natom;
@@ -103,14 +184,18 @@ re2post(char *re)
 		case '?':
 			if(natom == 0)
 				return NULL;
-			*dst++ = *re;
+			*dst++ = c;
 			break;
 		default:
 			if(natom > 1){
 				--natom;
 				*dst++ = '.';
 			}
-			*dst++ = *re;
+			sym = lookup_name(token, len, names, num_names);
+			if(sym < 0){
+				return NULL;
+			}
+			*dst++ = (char)(sym + 'A');
 			natom++;
 			break;
 		}
@@ -142,10 +227,15 @@ struct State
 	int c;
 	State *out;
 	State *out1;
-	int lastlist;
 };
-State matchstate = { Match };	/* matching state */
-int nstate;
+static State matchstate = { Match };	/* matching state */
+
+typedef struct NFA NFA;
+struct NFA
+{
+	State* start;
+	int num_states;
+};
 
 /* Allocate and initialize State */
 State*
@@ -153,9 +243,7 @@ state(int c, State *out, State *out1)
 {
 	State *s;
 	
-	nstate++;
 	s = malloc(sizeof *s);
-	s->lastlist = 0;
 	s->c = c;
 	s->out = out;
 	s->out1 = out1;
@@ -177,7 +265,7 @@ struct Frag
 };
 
 /* Initialize Frag struct. */
-Frag
+static Frag
 frag(State *start, Ptrlist *out)
 {
 	Frag n = { start, out };
@@ -196,7 +284,7 @@ union Ptrlist
 };
 
 /* Create singleton list containing just outp. */
-Ptrlist*
+static Ptrlist*
 list1(State **outp)
 {
 	Ptrlist *l;
@@ -207,7 +295,7 @@ list1(State **outp)
 }
 
 /* Patch the list of states at out to point to start. */
-void
+static void
 patch(Ptrlist *l, State *s)
 {
 	Ptrlist *next;
@@ -219,7 +307,7 @@ patch(Ptrlist *l, State *s)
 }
 
 /* Join the two lists l1 and l2, returning the combination. */
-Ptrlist*
+static Ptrlist*
 append(Ptrlist *l1, Ptrlist *l2)
 {
 	Ptrlist *oldl1;
@@ -233,20 +321,25 @@ append(Ptrlist *l1, Ptrlist *l2)
 
 /*
  * Convert postfix regular expression to NFA.
- * Return start state.
  */
-State*
+static NFA*
 post2nfa(char *postfix)
 {
 	char *p;
 	Frag stack[1000], *stackp, e1, e2, e;
 	State *s;
+	NFA *nfa;
 	
 	// fprintf(stderr, "postfix: %s\n", postfix);
 
 	if(postfix == NULL)
 		return NULL;
 
+	nfa = malloc(sizeof(NFA));
+	if (nfa == NULL)
+		return NULL;
+	nfa->num_states = 1;  /* The final (matching) state. */
+
 	#define push(s) *stackp++ = s
 	#define pop() *--stackp
 
@@ -255,6 +348,7 @@ post2nfa(char *postfix)
 		switch(*p){
 		default:
 			s = state(*p, NULL, NULL);
+			nfa->num_states++;
 			push(frag(s, list1(&s->out)));
 			break;
 		case '.':	/* catenate */
@@ -267,22 +361,26 @@ post2nfa(char *postfix)
 			e2 = pop();
 			e1 = pop();
 			s = state(Split, e1.start, e2.start);
+			nfa->num_states++;
 			push(frag(s, append(e1.out, e2.out)));
 			break;
 		case '?':	/* zero or one */
 			e = pop();
 			s = state(Split, e.start, NULL);
+			nfa->num_states++;
 			push(frag(s, append(e.out, list1(&s->out1))));
 			break;
 		case '*':	/* zero or more */
 			e = pop();
 			s = state(Split, e.start, NULL);
+			nfa->num_states++;
 			patch(e.out, s);
 			push(frag(s, list1(&s->out1)));
 			break;
 		case '+':	/* one or more */
 			e = pop();
 			s = state(Split, e.start, NULL);
+			nfa->num_states++;
 			patch(e.out, s);
 			push(frag(e.start, list1(&s->out1)));
 			break;
@@ -294,7 +392,8 @@ post2nfa(char *postfix)
 		return NULL;
 
 	patch(e.out, &matchstate);
-	return e.start;
+	nfa->start = e.start;
+	return nfa;
 #undef pop
 #undef push
 }
@@ -305,41 +404,38 @@ struct List
 	State **s;
 	int n;
 };
-List l1, l2;
-static int listid;
 
-void addstate(List*, State*);
-void step(List*, int, List*);
+typedef struct NState NState;
+struct NState
+{
+	struct List l;
+	int num_states;
+};
 
-/* Compute initial state list */
-List*
-startlist(State *start, List *l)
+static int
+inlist(List *l, State *s)
 {
-	l->n = 0;
-	listid++;
-	addstate(l, start);
-	return l;
+	int i;
+	for (i = 0; i < l->n; i++)
+		if (l->s[i] == s)
+			return 1;
+
+	return 0;
 }
 
 /* Check whether state list contains a match. */
 int
-ismatch(List *l)
+ismatch(NState *ns)
 {
-	int i;
-
-	for(i=0; i<l->n; i++)
-		if(l->s[i] == &matchstate)
-			return 1;
-	return 0;
+	return inlist(&ns->l, &matchstate);
 }
 
 /* Add s to l, following unlabeled arrows. */
-void
+static void
 addstate(List *l, State *s)
 {
-	if(s == NULL || s->lastlist == listid)
+	if(s == NULL || inlist(l, s))
 		return;
-	s->lastlist = listid;
 	if(s->c == Split){
 		/* follow unlabeled arrows */
 		addstate(l, s->out);
@@ -355,67 +451,79 @@ addstate(List *l, State *s)
  * to create next NFA state set nlist.
  */
 void
-step(List *clist, int c, List *nlist)
+step(NState *ns, int c)
 {
 	int i;
 	State *s;
+	List *srclist;
+	List dstlist;
 
-	listid++;
-	nlist->n = 0;
-	for(i=0; i<clist->n; i++){
-		s = clist->s[i];
+	/* Note that we encode input events as characters starting
+	 * from A.  (Yeah, it's kind of a hack.) */
+	c += 'A';
+
+	srclist = &ns->l;
+	dstlist.n = 0;
+	dstlist.s = alloca(ns->num_states * sizeof(State*));
+
+	/* Hack: The first state in the state list is _always_ the
+	 * start state.  That way, we match any input that has a
+	 * matching suffix. */
+	addstate(&dstlist, srclist->s[0]);
+
+	for(i=0; i<srclist->n; i++){
+		s = srclist->s[i];
 		if(s->c == c)
-			addstate(nlist, s->out);
+			addstate(&dstlist, s->out);
+	}
+
+	/* Copy dstlist back to srclist. */
+	srclist->n = dstlist.n;
+	memcpy(srclist->s, dstlist.s, ns->num_states * sizeof(State*));
+}
+
+NState*
+getstartstate(NFA *nfa)
+{
+	NState *start = malloc(sizeof(List));
+	State **s = malloc(nfa->num_states * sizeof(State*));
+	if (start == NULL || s == NULL){
+		free(start);
+		free(s);
+		return NULL;
 	}
+        start->l.n = 0;
+	start->l.s = s;
+	start->num_states = nfa->num_states;
+	addstate(&start->l, nfa->start);
+	return start;
 }
 
-/* Run NFA to determine whether it matches s. */
-int
-match(State *start, char *s)
+void
+freenstate(NState *ns)
 {
-	int i, c;
-	List *clist, *nlist, *t;
-
-	clist = startlist(start, &l1);
-	nlist = &l2;
-	for(; *s; s++){
-		c = *s & 0xFF;
-		step(clist, c, nlist);
-		t = clist; clist = nlist; nlist = t;	/* swap clist, nlist */
+	if (ns != NULL){
+		free(ns->l.s);
+		free(ns);
 	}
-	return ismatch(clist);
 }
 
-int
-main(int argc, char **argv)
+NFA*
+compile_re(const char* re, const char** names, int num_names)
 {
-	int i;
 	char *post;
-	State *start;
+	NFA *nfa;
 
-	if(argc < 3){
-		fprintf(stderr, "usage: nfa regexp string...\n");
-		return 1;
-	}
-	
-	post = re2post(argv[1]);
-	if(post == NULL){
-		fprintf(stderr, "bad regexp %s\n", argv[1]);
-		return 1;
+	post = re2post(re, names, num_names);
+	if (post == NULL){
+		return NULL;
 	}
 
-	start = post2nfa(post);
-	if(start == NULL){
-		fprintf(stderr, "error in post2nfa %s\n", post);
-		return 1;
-	}
-	
-	l1.s = malloc(nstate*sizeof l1.s[0]);
-	l2.s = malloc(nstate*sizeof l2.s[0]);
-	for(i=2; i<argc; i++)
-		if(match(start, argv[i]))
-			printf("%s\n", argv[i]);
-	return 0;
+	nfa = post2nfa(post);
+	if (nfa == NULL)
+		return NULL;
+
+	return nfa;
 }
 
 /* The original license for this file is provided here but does not
diff --git a/src/nfa.h b/src/nfa.h
new file mode 100644
index 0000000..01f572c
--- /dev/null
+++ b/src/nfa.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2010 Justin Seyster
+   Copyright (c) 2010 Erez Zadok
+   Copyright (c) 2010 Stony Brook University
+   Copyright (c) 2010 The Research Foundation of SUNY
+
+   This program is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License
+   and a copy of the GCC Runtime Library Exception along with this
+   program; see the files COPYING and COPYING.RUNTIME respectively.
+   If not, see <http://www.gnu.org/licenses/>. */
+
+/* The interface exposed to tracecut.c from our modified dfa1.c */
+
+struct NFA;
+struct NState;
+
+extern int ismatch (struct NState *ns);
+extern void step (struct NState *ns, int c);
+extern struct NState *getstartstate(struct NFA *nfa);
+extern void freenstate (struct NState *ns);
+extern struct NFA *compile_re(const char *re, const char **names,
+			      int num_names);
-- 
2.43.0