/*
 *	Copyright 1989 by Rayan S. Zachariassen, all rights reserved.
 *	This will be free software, but only when it is finished.
 */

/*
 * Filename expansion (globbing) routines.  The expand() function is called
 * just before pushing an argv onto a command descriptor in the interpreter.
 * As a side-effect, multiple buffers are mashed into one, and word separation
 * using IFS characters is also done here.
 */

#include "hostenv.h"
#include <stdio.h>
#include <sys/file.h>
#include <sys/stat.h>

#ifdef HAVE_DIRENT_H
# include <dirent.h>
#else /* not HAVE_DIRENT_H */
# define dirent direct
# ifdef HAVE_SYS_NDIR_H
#  include <sys/ndir.h>
# endif /* HAVE_SYS_NDIR_H */
# ifdef HAVE_SYS_DIR_H
#  include <sys/dir.h>
# endif /* HAVE_SYS_DIR_H */
# ifdef HAVE_NDIR_H
#  include <ndir.h>
# endif /* HAVE_NDIR_H */
#endif /* HAVE_DIRENT_H */

#include "sh.h"
#include "flags.h"
#include "zmalloc.h"
#include "listutils.h"
#include "io.h"			/* redefines stdio routines */
#include "shconfig.h"

#include "libsh.h"

/*
 * For speed we look up magic characters (*, ?, and [) to check whether
 * a word requires filename globbing.  The globchars array contains such flags.
 */

char globchars[CHARSETSIZE];

/*
 * Initialize the array, called from main(), once.
 */

void
glob_init()
{
	int i;
	for (i = 0; i < CHARSETSIZE; ++i)
	  globchars[i] = 0;

	globchars['*'] = globchars['?'] = globchars['['] = 1;
	/* see also sJumpIfMatch in interpreter for globchars['|'] */
}

/*
 * Because the result of filename expansion must be presented in alphabetical
 * order, we have to keep the pathnames somewhere temporarily to do a qsort().
 * Since we don't know how many there will be, we maintain a linked list of
 * (struct sawpath), then turn that into an array that we can sort easily.
 */

struct sawpath {
	struct sawpath *next;
	char	array[1];	/* is really array[strlen(path)+1] */
};


STATIC int glut __((char *cwd, char *pwd, int *pb, int recur, struct sawpath **swp));

/*
 * Normally we don't want to descend into symlink'ed directories, but in
 * case you do, this is where you change it.  It is INADVISABLE to use
 * stat() since in that case a super-glob can lead to infinite recursion.
 */

#ifdef	HAVE_LSTAT
extern int lstat();
static int (*statfcn)() = lstat;	/* stat if following symlinks, o/w lstat */
#else	/* !HAVE_LSTAT */
extern int stat();
static int (*statfcn)() = stat;
#endif	/* HAVE_LSTAT */

/*
 * Qsort() comparison function to sort pathnames.
 */

int
pathcmp(ap, bp)
     const void *ap, *bp;
{
	register const void **a = (const void **)ap;
	register const void **b = (const void **)bp;

	/* sort to reverse order, easier to collate
	   into an object chain */

	return -strcmp(*a, *b);
}

/* note that | is going to be used instead of / in some pathname examples */

/*
 * Super-glob.  This is exactly like glob except that the sequence: |**|
 * in a pathname will match any number of levels of directories.  Its an
 * old idea of mine so here's a wonderful opportunity to express myself within
 * the confines of sh!  The major thing to note is that quoted characters
 * do not qualify for globbing.  How does one know if a character is quoted?
 * Each character is stored in an int, the u_char value is obtained using
 * the BYTE() macro, and the quotedness is determined by the QUOTEBYTE bit
 * in the int.
 */

#define BYTE(X)		((X) & 0xFF)
#define	QUOTEBYTE	(0x8000)

STATIC conscell * sglob __((int *));

STATIC conscell *
sglob(ibuf)
	int *ibuf;	/* unglobbed pathname w/ each byte stored as int */
{
	register int i, n;
	conscell *pp, *cc = NULL;
	struct sawpath *spp;
	struct sawpath head;
	char	*pwd,		/* points to end of current directory in cwd */
		**base,		/* array of expanded filenames, for sorting */
		cwd[4096];	/* all pathnames are constructed in cwd */
	GCVARS1;

	if (BYTE(ibuf[0]) == '/') {
		cwd[0] = '/';
		pwd = cwd+1;
	} else if (BYTE(ibuf[0]) == '.' && BYTE(ibuf[1]) == '/') {
		cwd[0] = '.';
		cwd[1] = '/';
		pwd = cwd+2;
		ibuf += 2;
	} else
		pwd = cwd;

	*pwd = '\0';
	head.next = NULL;
	spp = &head;
	if (BYTE(ibuf[0]) != 0)
	  n = glut(cwd, pwd, ibuf, 0, &spp);	/* do expansion */
	else
	  goto leave;
	
	if (n <= 0)
	  goto leave;

#ifdef USE_ALLOCA
	base = (char **)alloca((sizeof (char *))*n);
#else
	base = (char **)malloc((sizeof (char *))*n);
#endif
	i = 0;
	for (spp = head.next; spp != NULL ; spp = spp->next)
		base[i++] = spp->array;
	qsort(base, n, sizeof base[0], pathcmp);
	/* construct a sorted linked list */
	cc = NULL;
	GCPRO1(cc);
	for (i = 0; i < n; ++i) {
		int slen = strlen(base[i]);
		pp = newstring(dupnstr(base[i],slen),slen);
		cdr(pp) = cc;
		cc = pp;
		/* printf("saw %s\n", base[i]); */
	}
	UNGCPRO1;
#ifndef USE_ALLOCA
	free(base);
#endif
 leave:
	spp = head.next;
	while (spp != NULL) {
	  struct sawpath *spp2 = spp->next;
	  free(spp);
	  spp = spp2;
	}
	return cc;
}

/*
 * Utility function to append a pathname to a linked list for later sorting.
 */

STATIC struct sawpath * stash __((const void *, int, struct sawpath *));
STATIC struct sawpath *
stash(s, len, ps)
	const void *s;		/* the pathname we want to stash away */
	int len;		/* its length */
	struct sawpath *ps;	/* the previous list element */
{
	register struct sawpath *spp;
	
	spp = (struct sawpath *)malloc(sizeof (struct sawpath) + len);
	ps->next = spp;
	spp->next = NULL;
	memcpy(spp->array, s, len);
	spp->array[len] = 0;
	return spp;
}

STATIC int kleene[] = { '*', '/', 0 };	/* foo|**|  becomes foo|**|*| */

/*
 * This routine is the recursive workhorse of the filename globbing.
 */

static int
glut(cwd, pwd, bp, recur, swp)
	char	*cwd,		/* always the same buffer */
		*pwd;		/* pointer to end of directories/ inside cwd */
	int	*bp,		/* next character in the name to glob */
		recur;		/* flag: superglob mode, deep dir. descend */
	struct sawpath **swp;
{
	int	*start,		/* beginning of simple file name to expand */
		*eoname,	/* end of same */
		*ip,
		havepattern,	/* the simple file name contains glob chars */
		flag,		/* remember to put the trailing / on cwd back */
		count,		/* number of expansions */
		namlen,		/* filename length */
		i;
	struct stat stbuf;
	struct dirent *dp;
	DIR *dirp = NULL;
	
	if (interrupted)
		return 0;
	if (pwd > cwd+1) {
		*--pwd = '\0';
		flag = 1;
	} else
		flag = 0;
	/* printf("%s:\n", cwd); */
	/*
	 * must do stat since assuming opendir will
	 * fail on files might not be portable.
	 */
	if ((*cwd == '\0' && statfcn(".", &stbuf) < 0) ||
	    (*cwd != '\0' && statfcn(cwd, &stbuf) < 0) ||
	    (stbuf.st_mode & S_IFMT) != S_IFDIR)
		return -1;
again:
	while (*bp != '\0' && BYTE(*bp) == '/')
		++bp;
	if (*bp == '\0') {
		if (recur)	/* we're at the end of the road of a |**| */
			bp = kleene;
		else {
			*swp = stash(cwd, (pwd - cwd), *swp);
			return 1;
		}
	}
	if (*bp == '*' && *(bp+1) == '*' && BYTE(*(bp+2)) == '/') {
		recur = 1;	/* superglob mode */
		bp += 2;
		if (*(bp+1))
			goto again;
		else
			++bp;	/* start and eoname will point at 0 */
	}
	start = bp;
	while (*bp != '\0' && BYTE(*bp) != '/')
		++bp;
	eoname = bp;

	/*
	 * Now we have a local name between start and eoname, relative to
	 * the directory we have open. search through the directory for
	 * that name, then do the whole thing again, recursively.
	 */
	
	havepattern = count = 0;
	/* optimization... is it worth globbing in inner loop far below? */
	for (ip = start; ip < eoname; ++ip)
		if (*ip == '*' || *ip == '?' || *ip == '[') {
			havepattern = 1;
			break;
		}
	if ((*cwd == '\0' && (dirp = opendir(".")) == NULL) ||
	    (*cwd != '\0' && (dirp = opendir((char *)cwd)) == NULL)) {
		perror((char *)cwd);
		return 0;
	}
	if (flag) {
		*pwd++ = '/';
		*pwd = '\0';
	}

	/* major loop */

	for (dp = readdir(dirp); dp != NULL; dp = readdir(dirp)) {

		/* a * won't match .files */
		if (*start == '*' && start == eoname - 1
		    && dp->d_name[0] == '.')
			continue;

		/* if we can't match the simple name, forget it */
		if (!recur &&
		    glob_match(start, eoname, dp->d_name) == 0)
			continue;
		strcpy((char *)pwd, dp->d_name);
		namlen = strlen(dp->d_name);
		if (recur && !(dp->d_name[0] == '.' && (dp->d_name[1] == '\0'
			|| (dp->d_name[1] == '.' && dp->d_name[2] == '\0')))) {
			/* in superglob mode... glut returns -1 if is a file */
			i = glut(cwd, pwd+namlen+1, start, recur, swp);
			if (i >= 0) {
				count += i;
				strcpy((char *)pwd, dp->d_name);
				if (*start != 0 && start != kleene)
					continue;
				++count;
				*swp = stash(cwd, (int)((pwd+namlen)-cwd), *swp);
			}
		}
		if (*eoname == 0) {	/* end of the globbable name */
			/* we aren't interested in descending directories */
			if (!recur ||
			    glob_match(start, eoname, dp->d_name))
				++count,
				*swp = stash(cwd, (int)((pwd+namlen)-cwd), *swp);
		} else if (!recur) {
			/* we are only interested in directories */
			i = glut(cwd, pwd+namlen+1, eoname, recur, swp);
			count += (i > 0 ? i : 0);
		}
		if (recur || havepattern)
			continue;
		else
			break;
	}
#ifdef	BUGGY_CLOSEDIR
	/*
	 * Major serious bug time here;  some closedir()'s
	 * free dirp before referring to dirp->dd_fd. GRRR.
	 * XX: remove this when bug is eradicated from System V's.
	 */
	close(dirp->dd_fd);
#endif
	closedir(dirp);
	return count;
}

/*
 * This is a heavily recursive globbing function, it uses the string (which
 * is actually an int array as mentioned above) as a DFA that it interprets.
 * This routine should be kept in sync with the case-statement globbing
 * in the interpreter.  Unfortunately the requirements are different enough
 * that sharing code would be hard.
 */

int
glob_match(pattern, eopattern, s)
	register int	*pattern, *eopattern;
	register const char	*s;
{
	register int i, i2, sense;

	while (eopattern == NULL || pattern < eopattern) {
		switch (*pattern) {
		case '*':
			while (*pattern == '*')
				pattern++;
			do {
				if (glob_match(pattern, eopattern, s))
					return 1;
			} while (*s++ != '\0');
			return 0;
		case '[':
			if (*s == '\0')
				return 0;
			sense = (*(pattern+1) != '!');
			if (!sense)
			  ++pattern;
			while ((*++pattern != ']') && (*pattern != *s)) {
			  if (pattern == eopattern)
			    return !sense;
			  if (*(pattern+1) == '-'
			      && (i2 = BYTE(*(pattern+2))) != ']'
			      && i2 != 0) {
			    i2 = (i2 < 128) ? i2 : 127;
			    for (i = BYTE(*pattern)+1; i <= i2; i++)
			      if (i == *s) {
				if (sense)
				  goto ok;
				else
				  return 0;
			      }
			    pattern += 2;
			  }
			}
			if ((*pattern == ']') == sense)
				return 0;
ok:
			while (*pattern++ != ']')
				if (pattern == eopattern)
					return 0;
			s++;
			break;
		case '?':
			if (*s == '\0')
				return 0;
			s++;
			pattern++;
			break;
		case '\0':
			return (*s == '\0');
		default:
			if (BYTE(*pattern++) != *s++)
				return 0;
		}
	}
	return (*s == '\0');
}

/* 
 * Mash the linked list of buffers passed into a single buffer on return.
 */

int
squish(d, bufp, ibufp, doglob)
	conscell *d; /* input is protected */
	char **bufp;
	int **ibufp, doglob;
{
	register char *cp, *bp;
	register int *ip;
	register int sawglob, mask, len;
	register conscell *l;
	char *buf;
	int *ibuf;

	if ((LIST(d) || ISQUOTED(d)) && cdr(d) == NULL)
	  return -1;
	/* how much space will unexpanded concatenation of buffers take? */
	for (l = d, len = 0, sawglob = 0; l != NULL; l = cdr(l)) {
	  int slen = l->slen;
	  if (l->string == NULL || slen == 0)
	    continue;
	  len += slen;
	  if (doglob) {
	    /* We do glob processing */
	    cp = l->string;
	    if (!sawglob) {
	      while (slen > 0) {
		if (globchars[BYTE(*cp)] != 0 &&
		    !(cp == d->string &&
		      *cp == '[' && *(d->string+1) == '\0')) {
		  ++sawglob;
		  break;
		}
		++cp; --slen;
	      }
	    }
	  }
	}

	/* f option disables filename generation */
	sawglob = (sawglob != 0); /* zero/one value space */
	if (doglob > 0) /* doglob == -1 for 'case' matching */
	  if (isset('f')) sawglob = 0;

	if (!sawglob && cdr(d) == NULL) {
	  return -1;
	}

	/* allocate something large enough to hold integer per char */
	*bufp = buf = (char *)malloc((len+1)*(sawglob ? sizeof(int) : 1));
	*ibufp = ibuf = (int *)buf;

	for (l = d, bp = buf, ip = ibuf; l != NULL; l = cdr(l)) {
	  if (l->string) {
	    if (sawglob) {
	      int slen = l->slen;
	      /*
	       * Create int array with quoted characters
	       * marked by the QUOTEBYTE bit.
	       */
	      mask = ISQUOTED(l) ? QUOTEBYTE : 0;
	      for (cp = l->string; slen > 0; --slen,++cp) {
#if 0
		if (*cp == '\\' && *(cp+1) != '\0')
		  *ip++ = BYTE(*++cp) | QUOTEBYTE;
		else
#endif
		  *ip++ = BYTE(*cp)   | mask;
	      }
	    } else if (ISQUOTED(l)) {
	      if (l->slen > 0) {
		memcpy(bp, l->string, l->slen);
		bp += l->slen;
	      }
	    } else {
	      int slen = l->slen;
	      for (cp = l->string; slen > 0; ++cp, --slen) {
		if (*cp == '\\' && slen > 1)
		  ++cp;
		*bp++ = *cp;
	      }
	    }
	  }
	}
	if (sawglob)
	  *ip = 0;
	else
	  *bp = '\0';
	return sawglob;
}

/*
 * Given a buffer list, check all non-quoted non-list portions for
 * file globbing characters and if relevant perform filename expansion.
 * The caller relies on the return value never being NULL.
 */

STATIC conscell * csglob __((conscell *, int doglob));
STATIC conscell *
csglob(d, doglob)
	conscell *d; int doglob;
{
	register char *bp;
	register int *ip;
	conscell *tmp;
	char *buf = NULL;
	int *ibuf;
	int s, slen;

	s = squish(d, &buf, &ibuf, doglob);
	switch (s) {
	case -1:
		/* if (buf) free(buf); -- no allocations; ever */
		return d;
	case 1:
		tmp = sglob(ibuf);
		if (tmp != NULL) {
		  free(buf);
		  return tmp;
		}
		for (bp = buf, ip = ibuf; *ip != '\0'; ++ip)
			*bp++ = BYTE(*ip);
		*bp = '\0';
		/* FALLTHROUGH */
		/* (re)filled the buffer, can throw away
		   the conscell chain in 'tmp'. */
	case 0:
		slen = strlen(buf);
		tmp = newstring(dupnstr(buf,slen),slen);
		free(buf);
		break;
	default:
		abort();
	}

	return tmp;
}

extern conscell *
sh_glob(avl,il)
     conscell *avl, *il;
{
	il = cdar(avl);
	if (il == NULL || !STRING(il)) {
		fprintf(stderr, "Usage: %s glob-patter-string\n",
				car(avl)->string);
		return NULL;
	}
	/* Always do the glob, never mind if 'set -f' is in effect! */
	il = csglob(il, -1);

	for (avl = il; avl != NULL; avl = cdr(avl))
	  if (STRING(avl))
	    avl->flags |= (QUOTEDSTRING|ELEMENT);

	return il;
}


/*
 * This function is called with the unexpanded buffer contents (usually
 * a linked list of strings) just prior to it being added as a command
 * argument.  Expansion involves scanning unquoted strings for whitespace
 * (as defined by IFS) and breaking those apart into multiple argv's,
 * as well as filename globbing of the resulting unquoted strings.
 * The return value is a list of argv's.
 *
 * The input array is safe to be munched in place!  No need to copy it!
 */


conscell *
expand(d, variant)
	conscell *d; /* input protected */
	int variant;
{
	conscell *tmp, *head, *next, *orig;
	conscell *globbed = NULL, **pav;
	register char *cp, *cp0;
	int slen, slen0;
	GCVARS6;


	/* Ok, following is MAGIC - sVariablePush when the
	   data is $(elements ..) produced list */

	if (variant == 2 && ISELEMENT(d)) return d;



	tmp = head = next = orig = globbed = NULL;
	GCPRO6(tmp, head, next, orig, globbed, d);

	/* grindef("EXP = ", d); */

	/* *NOT* copying at first with s_copy_tree -- 13 % of total system
	   runtime is in THAT call from THIS function! */

#if 0
	d = s_copy_chain(d); /* Copy the LINEAR CHAIN, not the whole tree! */
#endif

	orig = d;
	pav = &globbed;
	for (head = d; d != NULL; d = next) {
		if (head == NULL)
			head = d;
		next = cdr(d);
		if (LIST(d) || ISQUOTED(d)) {
			continue;
		} else if (ISELEMENT(d)) {
			if (head != d) {
				cdr(head) = NULL;
				head = *pav = csglob(head,0);
				while (cdr(head) != NULL) head = cdr(head);
				pav = &cdr(head);
			}
			head = NULL;
			d->flags &= ~ELEMENT;
			*pav = d;
			pav = &cdr(d);
			continue;
		}
		/* null strings should be retained */
		/* fprintf(stderr,"checking '%s'\n", d->string); */
		cp0 = cp = d->string;
		slen0 = slen = d->slen;
		if (head == d) {
			/* skip leading whitespace */
			while (slen > 0 && WHITESPACE(*cp)) ++cp, --slen;
			cp0 = cp;
		}
		while (slen > 0) {
		  if (WHITESPACE(*cp)) {
		    /* Two possible cases: cp0 == d->string, which means
		       the stuff is from the beginning of the string, and
		       playing with d->slen does work.
		       The second case is when the input string contains
		       possibly several white space separated substrings:
		       "aa bb cc", and this is second white-space part, or
		       later.. */
		    if (cp0 == d->string) {
		      /* Variant first.. */
		      int s0len = d->slen;
		      d->slen = (cp - d->string);
		      ++cp;
		      d->slen = (slen0 - slen);
		      --slen;
		      cdr(d) = NULL;
		      /* wrap the stuff at head into its own argv */
		      /* printf("wrapped '%s'\n", d->string); */
		      tmp = *pav = csglob(head,0);
		      while (cdr(tmp)) tmp = cdr(tmp);
		      if (tmp == d) {
			/* Crap ! That stuff didn't expand */
			;
		      } else
			d->slen = s0len;
		      pav = & cdr(tmp);
		    } else {
		      /* Variant second... */
		      /* This means also that the previous text bit
			 IS 'head' -- or can be set as it! */
		      head = newstring(dupnstr(cp0,cp-cp0),cp-cp0);
		      cdr(head) = NULL;
		      tmp = *pav = csglob(head,0);
		      while (cdr(tmp)) tmp = cdr(tmp);
		      pav = & cdr(tmp);
		    }

		    /* now find the continuation */
		    while (slen > 0 && WHITESPACE(*cp))
		      ++cp, --slen;

		    if (slen == 0) {
		      cp0 = d->string; /* mark to ignore this string */
		      head = NULL;
		      break;
		    } else {
		      /* We have more non-white-space stuff
			 following */
		      cp0 = cp;
		    }
		  }
		  ++cp;
		  --slen;
		}
		if (d->string != cp0) {
		  /* went to the end, and did start in the middle of
		     the thing */
		  int sslen = slen0 - (cp0 - d->string);
		  head = newstring(dupnstr(cp0, sslen), sslen);
		}
	}

	if (head != NULL) {
		/* printf("trailing '%s'\n", head->string); */
		/* glob is guaranteed to not return NULL */
		head = *pav = csglob(head,0);
		while (cdr(head) != NULL) head = cdr(head);
		pav = &cdr(head);
	}
	*pav = NULL;

	UNGCPRO6;

	/* fprintf(stderr, "EXPLEN = %d ",globbed->slen);
	   grindef("EXPOUT = ", globbed); */

	return globbed;
}


syntax highlighted by Code2HTML, v. 0.9.1