/* ports/mail/zmailer/work/zmailer-2.99.56/lib/rfc822scan.c */

/*
 *	Copyright 1990 by Rayan S. Zachariassen, all rights reserved.
 *	This will be free software, but only when it is finished.
 *
 *	Fixes done by Matti Aarnio <mea@nic.funet.fi>, and at least
 *	Zack at <zack@bitmover.com>.
 */

#include "hostenv.h"
#include "mailer.h"
#include "libz.h"

/* Start of the scanner */

/*
 * Scanner tables: definition of character classes according to RFC822.
 *
 * Each macro below is a single bit; an entry of rfc_ctype[] is the OR of
 * the class bits that apply to that character code.  All users in this
 * file index the table with (character & 0xFF).
 */

#define _h	  01	/* header field (char accepted by hdr_status() in a field name) */
#define _w	  02	/* linear white-space character (space / htab) */
#define _d	  04	/* digit */
#define _c	 010	/* control */
#define _a	 020	/* alphabetic */
#define _l	 040	/* line feed */
#define _r	0100	/* carriage return */
#define _s	0200	/* specials */
#define _8      0400	/* 8th bit on -- illegal on Headers! */

/* ISO Latin 1 (8859) */

/*
 * The table is NOT const: scan822() temporarily ORs _s into the entries
 * of its c1/c2 arguments and restores them before returning.
 */
#if defined(__alpha)||defined(__alpha__)
/* On Alpha ``short'' is slow to access! (this array is modified!) */
int
#else
/* All other systems are assumed to contain short-load/store instructions */
short
#endif
	rfc_ctype[256] = {					/* octalcode */
_c,	_c,	_c,	_c,	_c,	_c,	_c,	_c,	/*   0 -   7 */
_c,	_c|_w,	_l|_c,	_c,	_c,	_r|_c,	_c,	_c,	/*  10 -  17 (HT, LF, CR) */
_c,	_c,	_c,	_c,	_c,	_c,	_c,	_c,	/*  20 -  27 */
_c,	_c,	_c,	_c,	_c,	_c,	_c,	_c,	/*  30 -  37 */
_w,	_h,	_s|_h,	_h,	_h,	_h,	_h,	_h,	/*  40 -  47 (' ' - '\'') */
_s|_h,	_s|_h,	_h,	_h,	_s|_h,	_h,	_s|_h,	_h,	/*  50 -  57 ('(' - '/') */
_d|_h,	_d|_h,	_d|_h,	_d|_h,	_d|_h,	_d|_h,	_d|_h,	_d|_h,	/* '0' - '7' */
_d|_h,	_d|_h,	_s,	_s|_h,	_s|_h,	_h,	_s|_h,	_h,	/* '8' -  77 (':' has no _h: it ends a field name) */
_s|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	/* '@' - 'G' */
_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	/* 'H' - 'O' */
_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	/* 'P' - 'W' */
_a|_h,	_a|_h,	_a|_h,	_s|_h,	_s|_h,	_s|_h,	_h,	_h,	/* 'X' - '_' */
_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	/* '`' - 'g' */
_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	/* 'h' - 'o' */
_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	_a|_h,	/* 'p' - 'w' */
_a|_h,	_a|_h,	_a|_h,	_h,	_h,	_h,	_h,	_c,	/* 'x' - 177 (DEL is a control) */
	/* The class assignments of the second half are all ILLEGAL */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 200 - 207 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 210 - 217 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 220 - 227 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 230 - 237 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 240 - 247 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 250 - 257 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 260 - 267 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 270 - 277 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 300 - 307 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 310 - 317 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 320 - 327 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 330 - 337 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 340 - 347 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 350 - 357 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8,	/* 360 - 367 */
_8,	_8,	_8,	_8,	_8,	_8,	_8,	_8	/* 370 - 377 */
};


/*
 *  Support function for Router's  $(condquote ...) a.k.a $(dequote ...)
 *  function.
 */

int rfc822_mustquote(s, spc)
	register const char *s;
	const int spc;
{
	int inquote = 0, mustquote = 0, hasquotes = 0;
	int c;

	int spckosher = 0;

	if ((!(rfc_ctype[spc & 0xFF] & (_w|_c|_8|_s)))
	    || spc == '.')
	  spckosher = 1;

	for (; *s; ++s) {
	  c = *(const unsigned char *)s;
	  if (c == '"') {
	    hasquotes = 1;
	    inquote = !inquote;
	    continue;
	  }
	  if (c == '\\') {
	    /* special, thus must be checked before set lookups below */
	    /* This is part of a quoted pair, if there is next char,
	       pick it unchanged */
	    const char *s2 = s+1;
	    if (*s2) s = s2;
	    continue;
	  }

	  if (c == '@' && !inquote)
	    /* This special outside a quote is ok */
	    continue;

	  if (c == ':') {
	    /* Special, but  "HOST::USER"@gwhost   syntax is
	       even more special ... */
	    if (s[1] == ':' && inquote) mustquote = 1;
	    continue;
	  }

	  if (c == '.' || c == ',' || c == '[' || c == ']')
	    /* specials, thus must be checked before set lookups below */
	    continue;

	  if (c == ' ' && spckosher)
	    /* We are asked to consider replacing SPACEs with a new char,
	       if the result is kosher, no need to quote after it */
	    continue;

	  if ((rfc_ctype[c] & (_w|_c|_8|_s)) || (c == '|')) {
	    mustquote = 1;
	  }
	}
	return hasquotes | (inquote ? 2 : 0) | (mustquote ? 4 : 0);
}


/*
 * Tell whether we are looking at a new header line, a continuation line,
 * or if we are done with the header. Return the number of characters in
 * the name of the header, or 0 if a continuation, or < 0 if end of header
 * (that is, the line is not a well-formed "name:" header line).
 * If non-0, the magnitude of the number returned is the length of the
 * header field name that was scanned.
 */

#if 1

/* NOTE: The 'octo' variable is from long ago, dead code.. */


/*
 * Classify the line starting at `cp'; `lbuf + n' marks the end of the
 * valid data.  Returns 0 for a line starting with SPACE or TAB (a folded
 * continuation), `cp - lbuf' when a run of field-name characters is
 * terminated by ':', and `lbuf - cp' (zero or negative) otherwise.
 * `octo' is not used.
 */
int
hdr_status(cp, lbuf, n, octo)
	register const char *cp, *lbuf;
	int	n, octo;
{
	const char *limit = lbuf + n;

	if (*cp == ' ' || *cp == '\t') {
	  /* Walk over the leading white-space.  The position reached is
	     not used any more: [mea] a line consisting only of
	     white-space is treated as a header continuation as well
	     (RFC 822 rules, not BITNET 80char fixed width), so it is no
	     longer reported as end-of-headers (-1). */
	  for (; cp < limit; ++cp) {
	    if (!(rfc_ctype[(*cp) & 0xFF] & (_w|_l)))
	      break;
	  }
	  /* a continuation line (folded header) */
	  return 0;
	}

	/* Consume the characters that may make up a field name. */
	for (; cp < limit; ++cp) {
	  if (!(rfc_ctype[(*cp) & 0xFF] & _h))
	    break;
	}

	if (cp < limit && *cp == ':')
	  return cp - lbuf;		/* header line */

	/* No ':' follows the name: a malformed header line. */
	return lbuf - cp;
}

#else

/* DEAD CODE:  From time when router did magic things to implement
   a semi-optimized alias database compilation...

   This variant is excluded by the enclosing `#if 1' and is never
   compiled.  It differs from the live version in two ways: a line made
   up only of white-space is reported as end-of-headers (-1), and a
   non-zero `octo' selects a scan that accepts double-quoted runs and
   backslash escapes in the field name. */

int
hdr_status(cp, lbuf, n, octo)
	register const char *cp, *lbuf;
	int	n, octo;
{
	if (*cp == ' ' || *cp == '\t') {
	  /* skip leading white-space (and line feeds) */
	  while ((cp < lbuf + n) && (rfc_ctype[(*cp) & 0xFF] & (_w|_l)))
	    ++cp;
	  if (cp == lbuf + n)
	    /* a line containing only whitespace is EOH */
	    return -1;
	  /* a continuation line (folded header) */
	  return 0;
	}
	if (!octo) {
	  /* plain header: scan field-name characters up to the ':' */
	  while ((cp < lbuf + n) && (rfc_ctype[(*cp) & 0xFF] & _h))
	    ++cp;
	  if ((cp < lbuf + n) &&
	      (*cp == ':') /*&& (cp > cpin)*/)	/* header line */
	    return cp - lbuf;
	  /* if we get to here, we have a malformed header line */
	  /* if (*cp == ':' && cp == cpin) return -1; */
	  return lbuf - cp;
	} else {
	  /* complex calling relations -- we are parsing alias database,
	     and we want to have spaces allowed in the left-hand side.. */
	  char quote = 0;	/* '"' while inside a quoted run, else 0 */
	  while (cp < lbuf + n) {
	    char c = *cp;
	    if (c == '\\') {
	      /* step over the backslash; `c' still holds '\\', so the
	         tests below classify the backslash and the final ++cp
	         then skips the escaped character */
	      ++cp;
	      if (cp >= (lbuf + n))
		break;
	    }
	    if (c == quote)
	      quote = 0;		/* closing quote */
	    else if (c == '"')
	      quote = '"';		/* opening quote */
	    else if (!quote && !(rfc_ctype[c & 0xFF] & _h))
	      break;			/* unquoted non-name character ends the scan */
	    ++cp;
	  }
	  if (cp < lbuf + n && *cp == ':')
	    return cp - lbuf;
	  return lbuf - cp;
	}
}
#endif /* ... dead code */

#if 0
#define MKERROR(msg,prevp)	tn = makeToken((msg), strlen(msg)); \
				tn->t_type = Error; \
				tn->t_next = *(prevp); \
				*(prevp) = tn;
#else
static void
MKERROR(msg, prevp)
     const char *msg;
     token822 **prevp;
{
  token822 *tn = makeToken((msg), strlen(msg));
  tn->t_type = Error;
  tn->t_next = *(prevp);
  *(prevp) = tn;
}
#endif

/*
 * Recognize a compound token, or rather, a token which is defined by
 * matching start and end delimiters. A comment or quoted string is
 * the typical example. Comments may be recursive.
 *
 * cp		- must point at the opening delimiter (abort() otherwise).
 * np		- in: characters available starting at cp;
 *		  out: characters left unscanned.
 * cstart, cend	- opening and closing delimiter characters.
 * cpp		- out: receives the final scan position; when the closing
 *		  delimiter was found this is the delimiter itself.
 * type		- token type stored into tp; nesting of cstart is only
 *		  honoured when this is Comment.
 * tp		- token being filled in: t_type and t_len are set, and
 *		  t_pname is set to NULL when no closing delimiter was found.
 * tlist	- head of the chain onto which Error tokens are pushed.
 * tlistp	- if non-NULL and the token it points at has a successor,
 *		  scanning continues in that successor's text when the
 *		  current text is exhausted; *tlistp is advanced accordingly.
 *
 * Returns the length counted for the compound: 1 for the opening
 * delimiter, 1 per character stepped over, 1 per line crossing, and 1
 * for the closing delimiter when found.
 */

static u_long _hdr_compound __((const char *cp, int *np,
				int cstart, int cend,
				const char **cpp,
				TokenType type, token822 *tp,
				token822 **tlist, token822 **tlistp));

static u_long
_hdr_compound(cp, np, cstart, cend, cpp, type, tp, tlist, tlistp)
	register const char *cp;
	int	*np;
	int	cstart, cend;
	const char	**cpp;
	TokenType	type;
	token822	*tp, **tlist, **tlistp;
{
	int nest = 1;		/* open delimiters not yet closed */
	int len = 1;		/* already counts the opening delimiter */
	int n = *np;

	if (*cp != cstart)
		abort(); /* Sanity check!  Call fault! */
	++cp, --n;		/* step past the opening delimiter */

nextline:
	for (; n > 0; ++cp, --n, ++len) {
		/* cend is tested first, so when cstart == cend (quoted
		   strings) the character always counts as a terminator */
		if (*cp == cend) {
			if (--nest <= 0) {
			    /* leaves cp on the terminator, n > 0 */
			    break;
			}
		} else if (*cp == cstart) {
			if (type == Comment)
				++nest;
			else {
				MKERROR("illegal char in compound", tlist);
			}
		} else if (*cp == '\\') {
			if (n == 1) {
				MKERROR("missing character after backslash",
					tlist);
				/* Continue with next line, if existing! */
				n = 0;
				break;
			}
			/* quoted pair: skip the escaped character too */
			++cp;
			--n;
			++len;
		} else if (*cp == '\r') {
			/* type = Error; */
			MKERROR("illegal CR in token", tlist);
		}
	}
	/* we either found cend, or ran off the end, either may be within
	   a recursion */
	if (n == 0) { /* we ran off the end */
		char msgbuf[50];

		if (tlistp != NULL && *tlistp != NULL
		    && (*tlistp)->t_next != NULL) {
			/* compound token is continued on next line */
			*tlistp = (*tlistp)->t_next;
			n = TOKENLEN(*tlistp);
			cp = (*tlistp)->t_pname;
			++len;	/* one extra unit for the line crossing */
			goto nextline;
		}
		/* type=Error; */	/* hey, no reason to refuse a message*/
		sprintf(msgbuf, "missing closing '%c' in token", cend);
		MKERROR(msgbuf, tlist);
		tp->t_pname = NULL;	/* ugly way of signalling scanner */
	} else if (*cp == cend) {	/* we found matching terminator */
		++len;
		--n;			/* account for the terminator; cp itself stays on it */
	} else {	/* there was an error */
	  /* the loop above is only left with n == 0 or with cp on cend */
	  abort() ; /* ??? some sort of sanity check ? */
	}
	tp->t_type = type;
	tp->t_len = len;
	*np = n;
	*cpp = (char*)cp;
	return len;
}

/*
 * Unfold (see RFC822) the contents of a compound token that spans
 * several line tokens.
 *
 * type		- token type; for Comment the trailing spaces are stripped.
 * len		- total length to unfold; also bounds the result, which is
 *		  placed in a tmalloc()ed buffer of len+1 bytes.
 * start	- first character to copy.
 * cpp		- in: position at which copying stops;
 *		  out: set to one past the position where copying ended.
 * t		- token whose text `start' lies in; its t_next chain
 *		  supplies the text of the following lines.
 *
 * Line feeds are dropped.  At the NUL ending a line's text the copy
 * moves to the next token, skips its leading SPACE/TAB/LF/CR, and emits
 * exactly one SPACE in their place.  Returns the NUL-terminated copy.
 */

static const char * _unfold __((TokenType, int, const char *, const char **, token822*));
static const char *
_unfold(type, len, start, cpp, t)
	TokenType type;
	int len; /* Total length to unfold */
	const char *start;
	const char **cpp;
	token822 *t;
{
	const char *stop = *cpp;
	char *buf, *out;

	/* Start and End may be at different tmalloc()ed objects! */

	buf = (char *)tmalloc(len +1);
	out = buf;

	/* Every trip around the loop accounts for one unit of len. */
	for (; len > 0 && start != stop; --len) {
		if (*start == '\0') {
			/* End of this line's text: go on with the next. */
			t = t->t_next;
			start = t->t_pname;

			/* Skip all folding white-space */
			while (len > 0 && start != stop &&
			       (*start == ' '  || *start == '\t' ||
				*start == '\n' || *start == '\r')) {
				++start;
				--len;
			}
			/* And replace it with *one* space */
			*out++ = ' ';
			continue;
		}
		if (*start == '\n') {
			/* line feeds are not copied */
			++start;
			continue;
		}
		*out++ = *start++;
	}

	*cpp = start +1;

	if (type == Comment) {
		/* Strip trailing spaces at comments */
		while (out > buf && out[-1] == ' ')
			--out;
	}
	*out = '\0';
	return buf;
}

/*
 * The Scanner.
 *
 * cpp		- pointer to pointer to string; advanced past everything
 *		  that was consumed.
 * nn		- number of characters left in string.
 * c1, c2	- if non-NUL, these characters should be considered Special
 *		  for the duration of this call (the class table is patched
 *		  and restored before returning).
 * allowcomments - if non-zero, a token beginning with '#' terminates the
 *		  scan: that token is not returned and the remaining
 *		  characters are skipped.
 * tlistp	- continuation line tokens if any (may be NULL); used to
 *		  follow compound tokens across line boundaries.
 *
 * The scanner will return a token list corresponding to the n next characters
 * in the string. Originally only a single token was returned per call, but
 * for efficiency this was changed to avoid function call overhead. The tokens
 * returned are classified by type (TokenType enum class).
 *
 * Error tokens describing problems are inserted in the list ahead of the
 * token they were detected in.  NULL is returned when nn is 0.
 *
 * Not reentrant: the working token is a static object and rfc_ctype[]
 * is modified while the scan runs.
 */
token822 * scan822(cpp, nn, c1, c2, allowcomments, tlistp)
	const char **cpp;		/* pointer to pointer to text */
	size_t	nn;			/* number of characters to scan */
	int	c1, c2;			/* temporary specials */
	int allowcomments;		/* #prefix tokens are comments to EOT */
	token822 **tlistp;		/* continuation line tokens if any */
{
	register const char *cp;
	static token822  t;
	token822	*tlist, *tp, *tn, *ot;
	char	msgbuf[50];
	short	ct, sc1, sc2;
	int n = (int) nn;

	if (n == 0)
		return NULL;

	/* Make c1/c2 Specials; remember the old classes for the restore
	   at the end.  The index is always masked to 0..255 so that the
	   save, the patch and the restore hit the same table entry even
	   when the caller hands in a negative (sign-extended) value. */
	sc1 = sc2 = '\0';
	if (c1 != '\0') {
		sc1 = rfc_ctype[c1 & 0xFF];
		rfc_ctype[c1 & 0xFF] |= _s;
	}
	if (c2 != '\0') {
		sc2 = rfc_ctype[c2 & 0xFF];
		rfc_ctype[c2 & 0xFF] |= _s;
	}

	/* Tokens are pushed on the front of tlist (so it is built in
	   reverse) and the chain is turned around once at the end. */
	tlist = NULL;
	do {
		cp = *cpp;
		ct = rfc_ctype[(*cp) & 0xFF];
		/* t_len starts out as the count of characters available;
		   after the branch below, subtracting what is left (n)
		   yields the length of the token. */
		t.t_len = n;
		t.t_pname = cp;
		if (ct & _w) {		/* LWSP without the CR LF part */
			while (--n > 0 && (rfc_ctype[(*++cp) & 0xFF] & _w))
			  continue;
			t.t_type = Space;
		} else if (ct & _r) {	/* >= 1 CR followed by LFs is a fold */
			while (--n > 0 && (rfc_ctype[(*++cp) & 0xFF] & _r))
			  continue;
			if (n == 0 || !(rfc_ctype[(*cp) & 0xFF] & _l)) {
			  strcpy(msgbuf, "CR without LF (newline)");
			  MKERROR(msgbuf, &tlist);
			} else if (n > 1 && (rfc_ctype[(*cp) & 0xFF] & _l)) {
			  while (--n > 0 && (rfc_ctype[(*++cp) & 0xFF] & _l))
			    continue;
			  strcpy(msgbuf,"too many newlines (LFs) in field[1]");
			  MKERROR(msgbuf, &tlist);
			}
			t.t_type = Fold;
		} else if (ct & _l) {	/* >= 1 LFs without CR is a fold too */
			while (--n > 0 && (rfc_ctype[(*++cp) & 0xFF] & _l))
			  continue;
			strcpy(msgbuf,"too many newlines (LFs) in field[2]");
			MKERROR(msgbuf, &tlist);
			t.t_type = Fold;
		} else if ((ct & _s) && (*cp=='(' || *cp=='"' || *cp=='[')) {
			/* compound token: comment, quoted string or
			   domain literal */
			TokenType	type;
			char	cend;
			int len;

			if (*cp == '"') {
			  cend = '"';
			  type = String;
			} else if (*cp == '[') {
			  cend = ']';
			  type = DomainLiteral;
			} else {
			  cend = ')';
			  type = Comment;
			}
			/* remember where the continuation list stood, to
			   detect whether the compound crossed a line */
			ot = (tlistp == NULL ? NULL : *tlistp);
			len = _hdr_compound(cp, &n, *cp, cend, cpp,
					    type, &t, &tlist, tlistp);
			if (ot != NULL && tlistp != NULL && ot != *tlistp) {

			  /* a compound token crossed line boundary */
			  /* copy from ++cp for len chars */
			  t.t_pname = _unfold(type, len, ++cp, cpp, ot);
			  t.t_len   = strlen(t.t_pname);
			} else {
			  if (t.t_pname != NULL)
			    /* magic sign; NULL: no ending char */
			    --t.t_len, ++(*cpp);
				/* past first bracketing char */
			  --t.t_len;  /* ++(*cpp); */
			  t.t_pname = ++cp;
			}

			/* compensate for calculations below: after the
			   common "t.t_len -= n; *cpp += t.t_len" both
			   values come out as set up here */
			(*cpp)  -= t.t_len;
			t.t_len += n;

		} else if (ct & _s) {		/* specials */
			/* Double-colons as with DECNET */
			if (n > 1 && *cp == ':' && cp[1] == ':')
				--n;
			/* Backslash + special:  \@ \! \: ... */
			if (n > 1 && *cp == '\\' && cp[1] != 0 &&
			    (rfc_ctype[cp[1] & 0xFF] & _s))
				--n;
			--n;
			t.t_type = Special;
		} else if (!(ct & (_c|_8))) {	/* atom */
			while (--n > 0 &&
			       !(rfc_ctype[(*++cp) & 0xFF]&(_w|_s|_c|_l|_r)))
				continue;
			t.t_type = Atom;
		} else {
			/* run of control characters; an 8-bit character
			   met while looking ahead ends the run and is left
			   for the next round */
			int bit8 = 0;
			while (--n > 0 &&
			       (rfc_ctype[(*++cp) & 0xFF] & (_c|_8)))
				if (rfc_ctype[(*cp) & 0xFF] & _8) {
					bit8 = 1;
					break;
				}
			if (bit8)
				strcpy(msgbuf, "illegal 8-bit/control character");
			else
				strcpy(msgbuf, "illegal control character");
			/* plural when more than one character was taken */
			if (t.t_len > n+1)
				strcat(msgbuf, "s");
			MKERROR(msgbuf, &tlist);
			t.t_type = Atom;
		}
		t.t_len -= n;
		/* return two values */
		*cpp += t.t_len;
		if (allowcomments && t.t_len >= 1 && t.t_pname[0] == '#') {
			/* '#' comment: drop this token, skip the rest */
			*cpp += n;
			break;
		}
		t.t_next = tlist;
		if (t.t_len > 0)
			tlist = copyToken(&t);
		else {
			t.t_pname = "";
			t.t_len   = 0;
			tlist = copyToken(&t);
		}
	} while (n > 0);

	/* Reverse the token822 chain */
	tp = tn = NULL;
	for (tp = NULL; tlist != NULL; tlist = tn) {
		tn = tlist->t_next;
		tlist->t_next = tp;
		tp = tlist;
	}

	/* Put the original character classes back. */
	if (c1 != '\0') rfc_ctype[c1 & 0xFF] = sc1;
	if (c2 != '\0') rfc_ctype[c2 & 0xFF] = sc2;
	return tp;
}

/*
 * The UTEXT Scanner.
 *
 * cpp		- pointer to pointer to string; advanced past everything
 *		  that was consumed.
 * nn		- number of characters left in string.
 * tlistp	- continuation line tokens if any; not used by this function.
 *
 * Splits unstructured text into Space, Fold and Atom tokens only: runs of
 * SPACE/TAB become Space, CR/LF sequences become Fold, and every other
 * run of characters becomes one Atom.  Error tokens are inserted ahead
 * of the Fold token they were detected in.  The token list is returned
 * in input order; NULL is returned when nn is 0.
 */
token822 * scan822utext(cpp, nn, tlistp)
	const char **cpp;		/* pointer to pointer to text */
	size_t	nn;			/* number of characters to scan */
	token822 **tlistp;		/* continuation line tokens if any */
{
	static token822  tok;		/* working token, copied per result */
	const char	*p;
	token822	*chain, *fwd, *next;
	short	cls;
	int left = (int) nn;

	if (left == 0)
		return NULL;

	chain = NULL;			/* built back to front */
	do {
		p = *cpp;
		cls = rfc_ctype[(*p) & 0xFF];
		tok.t_pname = p;
		tok.t_len = left;	/* turned into a length below */

		if (cls & _w) {
			/* LWSP without the CR LF part */
			while (--left > 0) {
				++p;
				if (!(rfc_ctype[(*p) & 0xFF] & _w))
					break;
			}
			tok.t_type = Space;
		} else if (cls & _r) {
			/* >= 1 CR followed by LFs is a fold */
			while (--left > 0) {
				++p;
				if (!(rfc_ctype[(*p) & 0xFF] & _r))
					break;
			}
			if (left == 0 || !(rfc_ctype[(*p) & 0xFF] & _l)) {
				MKERROR("CR without LF (newline)", &chain);
			} else if (left > 1) {
				/* p is on a LF here (the test above
				   failed); take the LF and any more
				   that follow */
				while (--left > 0) {
					++p;
					if (!(rfc_ctype[(*p) & 0xFF] & _l))
						break;
				}
				MKERROR("too many newlines (LFs) in field[1]",
					&chain);
			}
			tok.t_type = Fold;
		} else if (cls & _l) {
			/* >= 1 LFs without CR is a fold too */
			while (--left > 0) {
				++p;
				if (!(rfc_ctype[(*p) & 0xFF] & _l))
					break;
			}
			MKERROR("too many newlines (LFs) in field[2]", &chain);
			tok.t_type = Fold;
		} else {
			/* Anything else is unstructured foldable Atom */
			while (--left > 0) {
				++p;
				if (rfc_ctype[(*p) & 0xFF] & (_w|_l|_r))
					break;
			}
			tok.t_type = Atom;
		}

		/* available-before minus available-after = token length */
		tok.t_len -= left;
		/* return two values */
		*cpp += tok.t_len;

		tok.t_next = chain;
		if (!(tok.t_len > 0)) {
			tok.t_pname = "";
			tok.t_len   = 0;
		}
		chain = copyToken(&tok);
	} while (left > 0);

	/* Reverse the token822 chain */
	fwd = NULL;
	while (chain != NULL) {
		next = chain->t_next;
		chain->t_next = fwd;
		fwd = chain;
		chain = next;
	}
	return fwd;
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */