/* search.c
 */

/* string search routines */
 
/*		Copyright (c) 1981,1980 James Gosling		*/
 
/* Modified Aug. 12, 1981 by Tom London to include regular expressions
   as in ed.  RE stuff hacked over by jag to correct a few major problems,
   mainly dealing with searching within the buffer rather than copying
   each line to a separate array.  Newlines can now appear in RE's */

/* Ripped to shreds and glued back together to make a search package,
 * July 6, 1984, by Larry Wall. (If it doesn't work, it's probably my fault.)
 * Changes include:
 *	Buffer, window, and mlisp stuff gone.
 *	Translation tables reduced to 1 table.
 *	Expression buffer is now dynamically allocated.
 *	Character classes now implemented with a bitmap.
 * Modified by David Canzi, Apr 1997:
 *	Check bounds on alternatives array.
 *	Correct spurious matching, eg. /: re: .*\bfoo/ matched ": re: bar".
 */

#include "EXTERN.h"
#include "common.h"
#include "util.h"
#include "util2.h"
#include "INTERN.h"
#include "search.h"

#ifndef BITSPERBYTE
#define BITSPERBYTE 8
#endif

#define BMAPSIZ (127 / BITSPERBYTE + 1)

#define MNULL	64		/* Bit is set in a meta-character defn to
				   indicate that the metacharacter can match
				   a null string.  advance() uses this. */
 
/* meta characters in the "compiled" form of a regular expression */
#define CBRA	(2|MNULL)	/* \( -- begin bracket */
#define	CCHR	4		/* a vanilla character */
#define	CDOT	6		/* . -- match anything except a newline */
#define	CCL	8		/* [...] -- character class */
#define	NCCL	10		/* [^...] -- negated character class */
#define CDOL	(12|MNULL)	/* $ -- matches the end of a line */
#define CEND	(14|MNULL)	/* The end of the pattern */
#define CKET	(16|MNULL)	/* \) -- close bracket */
#define CBACK	(18|MNULL)	/* \N -- backreference to the Nth bracketed
				   string */
#define CIRC	(20|MNULL)	/* ^ matches the beginning of a line */

#define WORD	32		/* matches word character \w */
#define NWORD	34		/* matches non-word characer \W */
#define WBOUND	(36|MNULL)	/* matches word boundary \b */
#define NWBOUND	(38|MNULL)	/* matches non-(word boundary) \B */
 
#define	STAR	01		/* * -- Kleene star, repeats the previous
				   REas many times as possible; the value
				   ORs with the other operator types */
 
#define ASCSIZ 256
typedef Uchar	TRANSTABLE[ASCSIZ];

static TRANSTABLE trans;
static bool folding = FALSE;

static int err;
static char* FirstCharacter;

void
search_init()
{
    register int    i;
    
    for (i = 0; i < ASCSIZ; i++)
	trans[i] = i;
}

void
init_compex(compex)
register COMPEX* compex;
{
    /* the following must start off zeroed */

    compex->eblen = 0;
    compex->brastr = NULL;
}

void
free_compex(compex)
register COMPEX* compex;
{
    if (compex->eblen) {
	free(compex->expbuf);
	compex->eblen = 0;
    }
    if (compex->brastr) {
	free(compex->brastr);
	compex->brastr = NULL;
    }
}

static char* gbr_str = NULL;
static int gbr_siz = 0;

char*
getbracket(compex,n)
register COMPEX* compex;
int n;
{
    int length = compex->braelist[n] - compex->braslist[n];

    if (!compex->nbra)
	return NULL;
    if (n > compex->nbra || !compex->braelist[n] || length < 0)
	return nullstr;
    growstr(&gbr_str, &gbr_siz, length+1);
    safecpy(gbr_str, compex->braslist[n], length+1);
    return gbr_str;
}

void
case_fold(which)
int which;
{
    register int i;

    if (which != folding) {
	if (which) {
	    for (i = 'A'; i <= 'Z'; i++)
		trans[i] = tolower(i);
	}
	else {
	    for (i = 'A'; i <= 'Z'; i++)
		trans[i] = i;
	}
	folding = which;
    }
}

/* Compile the given regular expression into a [secret] internal format */

char*
compile(compex, strp, RE, fold)
register COMPEX* compex;
register char* strp;
int RE;
int fold;
{
    register int c;
    register char* ep;
    char* lastep;
    char  bracket[NBRA];
    char* bracketp;
    char** alt = compex->alternatives;
    char* retmes = "Badly formed search string";
 
    case_fold(compex->do_folding = fold);
    if (!compex->eblen) {
	compex->expbuf = safemalloc(84);
	compex->eblen = 80;
    }
    ep = compex->expbuf;		/* point at expression buffer */
    *alt++ = ep;			/* first alternative starts here */
    bracketp = bracket;			/* first bracket goes here */
    if (*strp == 0) {			/* nothing to compile? */
	if (*ep == 0)			/* nothing there yet? */
	    return "Null search string";
	return NULL;			/* just keep old expression */
    }
    compex->nbra = 0;			/* no brackets yet */
    lastep = 0;
    for (;;) {
	if (ep + 4 - compex->expbuf >= compex->eblen)
	    ep = grow_eb(compex, ep, alt);
	c = *strp++;			/* fetch next char of pattern */
	if (c == 0) {			/* end of pattern? */
	    if (bracketp != bracket) {	/* balanced brackets? */
#ifdef VERBOSE
		retmes = "Unbalanced parens";
#endif
		goto cerror;
	    }
	    *ep++ = CEND;		/* terminate expression */
	    *alt++ = 0;			/* terminal alternative list */
	    return NULL;		/* return success */
	}
	if (c != '*')
	    lastep = ep;
	if (!RE) {			/* just a normal search string? */
	    *ep++ = CCHR;		/* everything is a normal char */
	    *ep++ = c;
	}
	else				/* it is a regular expression */
	    switch (c) {
 
		case '\\':		/* meta something */
		    switch (c = *strp++) {
		    case '(':
			if (compex->nbra >= NBRA) {
#ifdef VERBOSE
			    retmes = "Too many parens";
#endif
			    goto cerror;
			}
			*bracketp++ = ++compex->nbra;
			*ep++ = CBRA;
			*ep++ = compex->nbra;
			break;
		    case '|':
			if (bracketp>bracket) {
#ifdef VERBOSE
			    retmes = "No \\| in parens";	/* Alas! */
#endif
			    goto cerror;
			}
			*ep++ = CEND;
			*alt++ = ep;
			if (alt > compex->alternatives + NALTS) {
#ifdef VERBOSE
				retmes = "Too many alternatives in reg ex";
#endif
				goto cerror;
			}
			break;
		    case ')':
			if (bracketp <= bracket) {
#ifdef VERBOSE
			    retmes = "Unmatched right paren";
#endif
			    goto cerror;
			}
			*ep++ = CKET;
			*ep++ = *--bracketp;
			break;
		    case 'w':
			*ep++ = WORD;
			break;
		    case 'W':
			*ep++ = NWORD;
			break;
		    case 'b':
			*ep++ = WBOUND;
			break;
		    case 'B':
			*ep++ = NWBOUND;
			break;
		    case '0': case '1': case '2': case '3': case '4':
		    case '5': case '6': case '7': case '8': case '9':
			*ep++ = CBACK;
			*ep++ = c - '0';
			break;
		    default:
			*ep++ = CCHR;
			if (c == '\0')
			    goto cerror;
			*ep++ = c;
			break;
		    }
		    break;
		case '.':
		    *ep++ = CDOT;
		    continue;
 
		case '*':
		    if (lastep == 0 || *lastep == CBRA || *lastep == CKET
			|| *lastep == CIRC
			|| (*lastep&STAR)|| *lastep>NWORD)
			goto defchar;
		    *lastep |= STAR;
		    continue;
 
		case '^':
		    if (ep != compex->expbuf && ep[-1] != CEND)
			goto defchar;
		    *ep++ = CIRC;
		    continue;
 
		case '$':
		    if (*strp != 0 && (*strp != '\\' || strp[1] != '|'))
			goto defchar;
		    *ep++ = CDOL;
		    continue;
 
		case '[': {		/* character class */
		    register int i;
		    
		    if (ep - compex->expbuf >= compex->eblen - BMAPSIZ)
			ep = grow_eb(compex, ep, alt); /* reserve bitmap */

		    for (i = BMAPSIZ; i; --i)
			ep[i] = 0;
		    
		    if ((c = *strp++) == '^') {
			c = *strp++;
			*ep++ = NCCL;	/* negated */
		    }
		    else
			*ep++ = CCL;	/* normal */
		    
		    i = 0;		/* remember oldchar */
		    do {
			if (c == '\0') {
#ifdef VERBOSE
			    retmes = "Missing ]";
#endif
			    goto cerror;
			}
			if (*strp == '-' && *(++strp) != ']' && *strp)
			    i = *strp++;
			else
			    i = c;
			while (c <= i) {
			    ep[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
			    if (fold && isalpha(c))
				ep[(c ^ 32) / BITSPERBYTE] |=
				    1 << ((c ^ 32) % BITSPERBYTE);
					/* set the other bit too */
			    c++;
			}
		    } while ((c = *strp++) != ']');
		    ep += BMAPSIZ;
		    continue;
		}
 
	    defchar:
		default:
		    *ep++ = CCHR;
		    *ep++ = c;
	    }
    }
cerror:
    compex->expbuf[0] = 0;
    compex->nbra = 0;
    return retmes;
}

char*
grow_eb(compex, epp, alt)
register COMPEX* compex;
char* epp;
char** alt;
{
    register char* oldbuf = compex->expbuf;
    register char** altlist = compex->alternatives;

    compex->eblen += 80;
    compex->expbuf = saferealloc(compex->expbuf, (MEM_SIZE)compex->eblen + 4);
    if (compex->expbuf != oldbuf) {	/* realloc can change expbuf! */
	epp += compex->expbuf - oldbuf;
	while (altlist != alt)
	    *altlist++ += compex->expbuf - oldbuf; 
    }
    return epp;
}

char*
execute(compex, addr)
register COMPEX* compex;
char* addr;
{
    register char* p1 = addr;
    register Uchar* trt = trans;
    register int c;
 
    if (addr == NULL || compex->expbuf == NULL)
	return NULL;
    if (compex->nbra) {			/* any brackets? */
	for (c = 0; c <= compex->nbra; c++)
	    compex->braslist[c] = compex->braelist[c] = NULL;
	if (compex->brastr)
	    free(compex->brastr);
	compex->brastr = savestr(p1);	/* in case p1 is not static */
	p1 = compex->brastr;		/* ! */
    }
    case_fold(compex->do_folding);	/* make sure table is correct */
    FirstCharacter = p1;		/* for ^ tests */
    if (compex->expbuf[0] == CCHR && !compex->alternatives[1]) {
	c = trt[*(Uchar*)(compex->expbuf+1)]; /* fast check for first char */
	do {
	    if (trt[*(Uchar*)p1] == c && advance(compex, p1, compex->expbuf))
		return p1;
	    p1++;
	} while (*p1 && !err);
	if (err) err = 0;
	return NULL;
    }
    else {			/* regular algorithm */
	do {
	    register char** alt = compex->alternatives;
	    while (*alt) {
		if (advance(compex, p1, *alt++))
		    return p1;
	    }
	    p1++;
	} while (*p1 && !err);
	if (err) err = 0;
	return NULL;
    }
   /*NOTREACHED*/
}
 
/* advance the match of the regular expression starting at ep along the
   string lp, simulates an NDFSA */
bool
advance(compex, lp, ep)
register COMPEX* compex;
register char* ep;
register char* lp;
{
    register char* curlp;
    register Uchar* trt = trans;
    register int i;
 
    while (*lp || (*ep & (STAR|MNULL))) {
	switch (*ep++) {
 
	    case CCHR:
		if (trt[*(Uchar*)ep++] != trt[*(Uchar*)lp])
		    return FALSE;
		lp++;
		continue;
 
	    case CDOT:
		if (*lp == '\n') return FALSE;
		lp++;
		continue;
 
	    case CDOL:
		if (!*lp || *lp == '\n')
		    continue;
		return FALSE;
 
	    case CIRC:
		if (lp == FirstCharacter || lp[-1]=='\n')
		    continue;
		return FALSE;
 
	    case WORD:
		if (isalnum(*lp)) {
		    lp++;
		    continue;
		}
		return FALSE;
 
	    case NWORD:
		if (!isalnum(*lp)) {
		    lp++;
		    continue;
		}
		return FALSE;
 
	    case WBOUND:
		if ((lp == FirstCharacter || !isalnum(lp[-1])) !=
			(!*lp || !isalnum(*lp)) )
		    continue;
		return FALSE;
 
	    case NWBOUND:
		if ((lp == FirstCharacter || !isalnum(lp[-1])) ==
			(!*lp || !isalnum(*lp)))
		    continue;
		return FALSE;
 
	    case CEND:
		return TRUE;
 
	    case CCL:
		if (cclass(ep, *lp, 1)) {
		    ep += BMAPSIZ;
		    lp++;
		    continue;
		}
		return FALSE;
 
	    case NCCL:
		if (cclass(ep, *lp, 0)) {
		    ep += BMAPSIZ;
		    lp++;
		    continue;
		}
		return FALSE;
 
	    case CBRA:
		compex->braslist[(unsigned char)*ep++] = lp;
		continue;
 
	    case CKET:
		i = *ep++;
		compex->braelist[i] = lp;
		compex->braelist[0] = lp;
		compex->braslist[0] = compex->braslist[i];
		continue;
 
	    case CBACK:
		if (compex->braelist[i = *ep++] == 0) {
		    fputs("bad braces\n",stdout) FLUSH;
		    err = TRUE;
		    return FALSE;
		}
		if (backref(compex, i, lp)) {
		    lp += compex->braelist[i] - compex->braslist[i];
		    continue;
		}
		return FALSE;
 
	    case CBACK | STAR:
		if (compex->braelist[i = *ep++] == 0) {
		    fputs("bad braces\n",stdout) FLUSH;
		    err = TRUE;
		    return FALSE;
		}
		curlp = lp;
		while (backref(compex, i, lp)) {
		    lp += compex->braelist[i] - compex->braslist[i];
		}
		while (lp >= curlp) {
		    if (advance(compex, lp, ep))
			return TRUE;
		    lp -= compex->braelist[i] - compex->braslist[i];
		}
		continue;
 
	    case CDOT | STAR:
		curlp = lp;
		while (*lp++ && lp[-1] != '\n') ;
		goto star;
 
	    case WORD | STAR:
		curlp = lp;
		while (*lp++ && isalnum(lp[-1])) ;
		goto star;
 
	    case NWORD | STAR:
		curlp = lp;
		while (*lp++ && !isalnum(lp[-1])) ;
		goto star;
 
	    case CCHR | STAR:
		curlp = lp;
		while (*lp++ && trt[*(Uchar*)(lp-1)] == trt[*(Uchar*)ep]) ;
		ep++;
		goto star;
 
	    case CCL | STAR:
	    case NCCL | STAR:
		curlp = lp;
		while (*lp++ && cclass(ep, lp[-1], ep[-1] == (CCL | STAR))) ;
		ep += BMAPSIZ;
		goto star;
 
	star:
		do {
		    lp--;
		    if (advance(compex, lp, ep))
			return TRUE;
		} while (lp > curlp);
		return FALSE;
 
	    default:
		fputs("Badly compiled pattern\n",stdout) FLUSH;
		err = TRUE;
		return -1;
	}
    }
    return FALSE;
}
 
bool
backref(compex, i, lp)
register COMPEX* compex;
register int i;
register char* lp;
{
    register char* bp;
 
    bp = compex->braslist[i];
    while (*lp && *bp == *lp) {
	bp++;
	lp++;
	if (bp >= compex->braelist[i])
	    return TRUE;
    }
    return FALSE;
}

bool
cclass(set, c, af)
register char* set;
register int c;
int af;
{
    c &= 0177;
#if BITSPERBYTE == 8
    if (set[c >> 3] & 1 << (c & 7))
#else
    if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
#endif
	return af;
    return !af;
}


syntax highlighted by Code2HTML, v. 0.9.1