/*
 * Copyright (c) 2002-2005 Sendmail, Inc. and its suppliers.
 *	All rights reserved.
 *
 * By using this file, you agree to the terms and conditions set
 * forth in the LICENSE file which can be found at the top level of
 * the sendmail distribution.
 */

#include "sm/generic.h"
SM_RCSID("@(#)$Id: rfc2822.c,v 1.26 2006/12/27 03:41:09 ca Exp $")
#include "sm/assert.h"
#include "sm/error.h"
#include "sm/rfc2821.h"
#include "sm/rfc2822.h"
#include "sm/heap.h"

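/*
 * RFC 2822 address handling: tokenizing an address string, turning a
 * token array back into text, and normalizing address lists.
 *
 * This is only a partial implementation of the RFC 2822 grammar.
 */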

/*
 * Make sure the token array of x can hold at least n entries.
 *
 * If x already owns an array it is grown (with some slack) when n exceeds
 * the current capacity; the stored length is left alone.  If x has no
 * array yet, one is allocated and the length is reset to 0.
 *
 * Returns 1 on success, 0 on allocation failure.  On failure x is left
 * exactly as it was, so an existing array stays usable and can still be
 * released by its owner.
 */
static int
t2822_ready(sm_2822_a_T *x, uint n)
{
	uint newcap;
	sm_2822_P grown;

	if (x->sm_2822a_t != NULL)
	{
		if (n <= x->sm_2822a_a)
			return 1;

		/* grow by a little more than requested to amortize reallocs */
		newcap = 30 + n + (n >> 3);
		if (newcap < n ||
		    newcap > ((size_t) -1) / sizeof(sm_2822_T))
			return 0;	/* size computation would overflow */

		/*
		 * Do not store the result directly in x->sm_2822a_t: if the
		 * call fails the only reference to the old array would be
		 * overwritten with NULL.
		 * NOTE(review): assumes sm_realloc() follows realloc(3) and
		 * leaves the old block valid on failure -- confirm.
		 */
		grown = (sm_2822_P) sm_realloc(x->sm_2822a_t,
					newcap * sizeof(sm_2822_T));
		if (grown == NULL)
			return 0;
		x->sm_2822a_t = grown;
		x->sm_2822a_a = newcap;
		return 1;
	}

	/* first allocation; never request zero bytes */
	newcap = (n > 0) ? n : 1;
	if (newcap > ((size_t) -1) / sizeof(sm_2822_T))
		return 0;
	grown = (sm_2822_P) sm_malloc(newcap * sizeof(sm_2822_T));
	if (grown == NULL)
		return 0;
	x->sm_2822a_t = grown;
	x->sm_2822a_a = newcap;
	x->sm_2822a_len = 0;
	return 1;
}

/*
 * Make sure the token array of x has room for n MORE entries beyond the
 * ones already stored (sm_2822a_len).
 *
 * If x has no array yet, one with room for n entries is allocated and the
 * length is reset to 0.
 *
 * Returns 1 on success, 0 on allocation failure.  On failure x is left
 * exactly as it was, so an existing array stays usable and can still be
 * released by its owner.
 */
static int
t2822_readyplus(sm_2822_a_T *x, uint n)
{
	uint need, newcap;
	sm_2822_P grown;

	if (x->sm_2822a_t != NULL)
	{
		need = n + x->sm_2822a_len;
		if (need < n)
			return 0;	/* length + n wrapped around */
		if (need <= x->sm_2822a_a)
			return 1;

		/* grow by a little more than requested to amortize reallocs */
		newcap = 30 + need + (need >> 3);
		if (newcap < need ||
		    newcap > ((size_t) -1) / sizeof(sm_2822_T))
			return 0;	/* size computation would overflow */

		/*
		 * Do not store the result directly in x->sm_2822a_t: if the
		 * call fails the only reference to the old array would be
		 * overwritten with NULL.
		 * NOTE(review): assumes sm_realloc() follows realloc(3) and
		 * leaves the old block valid on failure -- confirm.
		 */
		grown = (sm_2822_P) sm_realloc(x->sm_2822a_t,
					newcap * sizeof(sm_2822_T));
		if (grown == NULL)
			return 0;
		x->sm_2822a_t = grown;
		x->sm_2822a_a = newcap;
		return 1;
	}

	/* first allocation; never request zero bytes */
	newcap = (n > 0) ? n : 1;
	if (newcap > ((size_t) -1) / sizeof(sm_2822_T))
		return 0;
	grown = (sm_2822_P) sm_malloc(newcap * sizeof(sm_2822_T));
	if (grown == NULL)
		return 0;
	x->sm_2822a_t = grown;
	x->sm_2822a_a = newcap;
	x->sm_2822a_len = 0;
	return 1;
}

/*
 * Append a copy of token *i to the end of x, growing x if necessary.
 *
 * The token is copied by struct assignment, so any pointers inside it are
 * shared with the original.
 *
 * Returns 1 on success, 0 if no room could be made.
 */
static int
t2822_append(sm_2822_a_T *x, sm_2822_P i)
{
	uint slot;

	if (t2822_readyplus(x, 1) == 0)
		return 0;

	slot = x->sm_2822a_len;
	x->sm_2822a_t[slot] = *i;
	x->sm_2822a_len = slot + 1;
	return 1;
}

/*
 * Ready-made comma token (type T2822_COMMA, remaining members NULL).
 * t2822_addrlist() appends copies of it to its output via FLUSHCOMMA.
 */
static sm_2822_T comma = { T2822_COMMA, NULL, NULL, NULL };

/*
 * Reverse the order of the tokens stored in ta, in place.
 * Arrays with fewer than two tokens are left untouched.
 */
static void
t2822_reverse(sm_2822_a_P ta)
{
	int lo, hi;
	sm_2822_T held;

	/* walk two indices towards each other, swapping as we go */
	lo = 0;
	hi = ta->sm_2822a_len - 1;
	while (lo < hi)
	{
		held = ta->sm_2822a_t[lo];
		ta->sm_2822a_t[lo] = ta->sm_2822a_t[hi];
		ta->sm_2822a_t[hi] = held;
		++lo;
		--hi;
	}
}

/*
 * Decide whether a blank has to be written between two adjacent tokens.
 *
 * t1 is the type of the token already written (0 means "nothing written
 * yet"), t2 the type of the token about to be written.
 *
 * A blank is wanted after a colon or a comma, before a '<', and between
 * two tokens whose types both compare >= T2822_ATOM.
 */
static bool
needspace(int t1, int t2)
{
	/* fixme: put a macro in .h?? */
	return t1 != 0 &&
	       (t1 == T2822_COLON ||
		t1 == T2822_COMMA ||
		t2 == T2822_LEFT ||
		(t1 >= T2822_ATOM && t2 >= T2822_ATOM));
}

/*
 * Delimiter set used by atomtype(): a character that atomtype() finds in
 * this string is reported as -1 (not part of an atom).
 */
static const char delim[] = " \t\r\n([\"<>;:@,.";
#if 0
static const char t2822_opchar[] = "|\"(),.:;<>@[]%!";
#endif /* 0 */

/*
 * Classify a character with respect to atoms.
 *
 * Returns
 *	-1		if ch is one of the delimiters in delim[] and
 *			therefore ends an atom,
 *	T2822_QUOTED	if ch may be part of a word but forces that word
 *			to be written as a quoted string (control
 *			characters, bytes > 126, ')', ']' and '\\'),
 *	T2822_ATOM	otherwise.
 *
 * The delimiter test has to come first: tab, CR and LF are listed in
 * delim[] but are also < 32, so testing the "needs quoting" range first
 * would swallow them into the atom instead of ending it.
 */
int
atomtype(uchar ch)
{
	/* strchr() also matches the terminating NUL, so keep NUL out of it */
	if (ch != '\0' && strchr(delim, (int) ch) != NULL)
		return -1;
	if ((ch < 32) || (ch > 126) ||
	    (ch == ')') || (ch == ']') || (ch == '\\'))
		return T2822_QUOTED;
	return T2822_ATOM;
}


/*
 * Append the textual form of the token array ta to the string sa.
 *
 * Punctuation tokens are written as their single character.  Quoted
 * strings, domain literals and comments are wrapped in "...", [...] and
 * (...) respectively; atoms are written bare.  Inside any of these the
 * characters " [ ] ( ) \ CR and LF are preceded by a backslash.  A blank
 * is inserted between tokens wherever needspace() asks for one.  Tokens
 * of any other type produce no output.
 *
 * Returns 1 on success, -1 as soon as sm_str_put() reports a failure; in
 * that case sa keeps whatever was written so far and the caller has to
 * clean up.
 */
int
t2822_unparse(sm_str_P sa, sm_2822_a_P ta)
{
	uint tok, pos;
	int c, last, kind;
	int opener, closer;
	sm_2822_P cur;

	last = 0;
	for (tok = 0; tok < ta->sm_2822a_len; ++tok)
	{
		cur = &ta->sm_2822a_t[tok];
		kind = cur->sm_2822_type;

		if (needspace(last, kind) && sm_str_put(sa, (uchar) ' '))
			return -1;
		last = kind;

		switch (kind)
		{
		  case T2822_COMMA:
			if (sm_str_put(sa, (uchar) ','))
				return -1;
			continue;

		  case T2822_AT:
		  case T2822_DOT:
		  case T2822_LEFT:
		  case T2822_RIGHT:
		  case T2822_SEMI:
		  case T2822_COLON:
			/* the type value is written as the character itself */
			if (sm_str_put(sa, (uchar) kind))
				return -1;
			continue;

		  case T2822_QUOTED:
			opener = '"';
			closer = '"';
			break;

		  case T2822_LITERAL:
			opener = '[';
			closer = ']';
			break;

		  case T2822_COMMENT:
			opener = '(';
			closer = ')';
			break;

		  case T2822_ATOM:
			/* atoms are not bracketed */
			opener = 0;
			closer = 0;
			break;

		  default:
			/* unknown token types are skipped */
			continue;
		}

		/* token with a value: opener, escaped value, closer */
		if (opener != 0 && sm_str_put(sa, (uchar) opener))
			return -1;

		for (pos = 0; pos < sm_str_getlen(cur->sm_2822_val); ++pos)
		{
			c = sm_str_rd_elem(cur->sm_2822_val, pos);
			if (c == '"' || c == '[' || c == ']' ||
			    c == '(' || c == ')' || c == '\\' ||
			    c == '\r' || c == '\n')
			{
				if (sm_str_put(sa, (uchar) '\\'))
					return -1;
			}
			if (sm_str_put(sa, (uchar) c))
				return -1;
		}

		if (closer != 0 && sm_str_put(sa, (uchar) closer))
			return -1;
	}
	return 1;
}

/*
 * Append the token array ta to the string sa without any quoting.
 *
 * Punctuation tokens are written as their single character.  The values
 * of atoms and quoted strings are copied verbatim (no surrounding quotes,
 * no backslash escapes); domain literals are copied verbatim between
 * '[' and ']'.  Comments, and tokens of any other type, are dropped.
 * No blanks are inserted between tokens.
 *
 * Returns 1 on success, -1 as soon as sm_str_put() reports a failure; sa
 * then keeps whatever was written so far.
 */
int
t2822_unquote(sm_str_P sa, sm_2822_a_P ta)
{
	uint tok, pos;
	bool bracketed;
	sm_2822_P cur;

	for (tok = 0; tok < ta->sm_2822a_len; ++tok)
	{
		cur = &ta->sm_2822a_t[tok];

		switch (cur->sm_2822_type)
		{
		  case T2822_COMMA:
		  case T2822_AT:
		  case T2822_DOT:
		  case T2822_LEFT:
		  case T2822_RIGHT:
		  case T2822_SEMI:
		  case T2822_COLON:
			/* the type value is written as the character itself */
			if (sm_str_put(sa, (uchar) (cur->sm_2822_type)))
				return -1;
			continue;

		  case T2822_ATOM:
		  case T2822_QUOTED:
		  case T2822_LITERAL:
			break;

		  default:
			/* comments (and anything unknown) yield no output */
			continue;
		}

		/* only domain literals keep their brackets */
		bracketed = (cur->sm_2822_type == T2822_LITERAL);

		if (bracketed && sm_str_put(sa, (uchar) '['))
			return -1;

		for (pos = 0; pos < sm_str_getlen(cur->sm_2822_val); ++pos)
		{
			if (sm_str_put(sa,
				       sm_str_rd_elem(cur->sm_2822_val, pos)))
				return -1;
		}

		if (bracketed && sm_str_put(sa, (uchar) ']'))
			return -1;
	}
	return 1;
}

/*
 * Split the string sa into tokens and store them in ta.
 *
 * Token kinds:
 *	. , @ < > : ;	one token each; the type is the character itself
 *			and the value is NULL
 *	( ... )		T2822_COMMENT; parentheses may nest, the nested
 *			parentheses themselves are not stored
 *	" ... "		T2822_QUOTED
 *	[ ... ]		T2822_LITERAL (kept as one token)
 *	anything else	a word running up to the next character for which
 *			atomtype() returns -1; T2822_ATOM, or T2822_QUOTED
 *			if any of its characters needs quoting
 * Blank, tab, CR and LF between tokens are skipped.  A backslash takes
 * the following character literally; the backslash is not stored.
 *
 * The work is done in two passes: the first one checks the syntax and
 * counts the tokens, the second one allocates and fills them in.
 *
 * Returns
 *	1	success; ta holds the tokens
 *	0	syntax error (stray ')' or ']', unterminated comment,
 *		quoted string or literal, dangling backslash inside one
 *		of those); ta has not been touched
 *	-1	out of memory; ta->sm_2822a_len then counts exactly the
 *		tokens whose type and value are valid, so the caller can
 *		release them
 *
 * NOTE(review): if ta already held tokens, their values are overwritten
 * here without being released -- the caller presumably has to free or
 * reset ta before reusing it; confirm.
 */
sm_ret_T
t2822_parse(sm_2822_a_T *ta, sm_str_P sa)
{
	int i, salen, level, r;
	int numtoks;
	sm_2822_P t;

	salen = sm_str_getlen(sa);

	/*
	 * pass 1: syntax check, and figure out how many tokens (numtoks)
	 * it takes to store this address.
	 */
	numtoks = 0;
	for (i = 0; i < salen; ++i)
	{
		switch (sm_str_rd_elem(sa, i))
		{
		  case '.': case ',': case '@': case '<': case '>':
		  case ':': case ';':
			++numtoks;
			break;
		  case ' ': case '\t': case '\r': case '\n':
			break;
		  case ')': case ']':
			return 0;
		/* other control chars and non-ASCII chars are also bad, in theory */
		  case '(':
			level = 1;
			while (level)
			{
				if (++i >= salen)
					return 0;
				switch (sm_str_rd_elem(sa, i))
				{
				  case '(':
					++level;
					break;
				  case ')':
					--level;
					break;
				  case '\\':
					if (++i >= salen)
						return 0;
					break;
				  default:
					break;
				}
			}
			++numtoks;
			break;

		/* same as above, but without nesting */
		  case '"':
			level = 1;
			while (level)
			{
				if (++i >= salen)
					return 0;
				switch (sm_str_rd_elem(sa, i))
				{
				  case '"':
					--level;
					break;
				  case '\\':
					if (++i >= salen)
						return 0;
					break;
				  default:
					break;
				}
			}
			++numtoks;
			break;

		/* same as above; [...] is taken as one token, not broken apart */
		  case '[':
			level = 1;
			while (level)
			{
				if (++i >= salen)
					return 0;
				switch (sm_str_rd_elem(sa, i))
				{
				  case ']':
					--level;
					break;
				  case '\\':
					if (++i >= salen)
						return 0;
					break;
				  default:
					break;
				}
			}
			++numtoks;
			break;

		  default:
			do
			{
				if (sm_str_rd_elem(sa, i) == '\\')
					if (++i >= salen)
						break;
				if (++i >= salen)
					break;
			} while (atomtype(sm_str_rd_elem(sa, i)) > 0);
			/* step back: the outer loop advances i again */
			--i;
			++numtoks;
		}
	}

	/* allocate space for the tokens */
	if (!t2822_ready(ta, numtoks))
		return -1;

	/*
	 * pass 2: fill in the tokens.  The length is advanced only once a
	 * token's type and value are both valid, so that it never covers
	 * uninitialized entries, not even when we bail out half way.
	 */
	ta->sm_2822a_len = 0;

	t = ta->sm_2822a_t;
	for (i = 0; i < salen; ++i)
	{
		switch (sm_str_rd_elem(sa, i))
		{
		  case '.':
		  case ',':
		  case '@':
		  case '<':
		  case '>':
		  case ':':
		  case ';':
			t->sm_2822_type = sm_str_rd_elem(sa, i);
			t->sm_2822_val = NULL;
			++ta->sm_2822a_len;
			++t;
			break;
		  case ' ': case '\t': case '\r': case '\n':
			break;
		  case ')':
		  case ']':
			/* pass 1 already rejected these */
			goto error;
		  case '(':
			t->sm_2822_type = T2822_COMMENT;
			t->sm_2822_val = sm_str_new(NULL, 10, 1024);
			if (t->sm_2822_val == NULL)
				goto error;
			++ta->sm_2822a_len;
			level = 1;
			while (level)
			{
				++i; /* pass 1 guarantees: < salen */
				switch (sm_str_rd_elem(sa, i))
				{
				  case '(':
					++level;
					break;
				  case ')':
					--level;
					break;
				  case '\\':
					++i; /* pass 1 guarantees: < salen */
					/* FALLTHROUGH */
				  default:
					if (sm_str_put(t->sm_2822_val,
							sm_str_rd_elem(sa, i)))
						goto error;
				}
			}
			++t;
			break;
		  case '"':
			t->sm_2822_type = T2822_QUOTED;
			t->sm_2822_val = sm_str_new(NULL, 10, 1024);
			if (t->sm_2822_val == NULL)
				goto error;
			++ta->sm_2822a_len;
			level = 1;
			while (level)
			{
				++i; /* pass 1 guarantees: < salen */
				switch (sm_str_rd_elem(sa, i))
				{
				  case '"':
					--level;
					break;
				  case '\\':
					++i; /* pass 1 guarantees: < salen */
					/* FALLTHROUGH */
				  default:
					if (sm_str_put(t->sm_2822_val,
							sm_str_rd_elem(sa, i)))
						goto error;
				}
			}
			++t;
			break;
		  case '[':
			t->sm_2822_type = T2822_LITERAL;
			t->sm_2822_val = sm_str_new(NULL, 10, 1024);
			if (t->sm_2822_val == NULL)
				goto error;
			++ta->sm_2822a_len;
			level = 1;
			while (level)
			{
				++i; /* pass 1 guarantees: < salen */
				switch (sm_str_rd_elem(sa, i))
				{
				  case ']':
					--level;
					break;
				  case '\\':
					++i; /* pass 1 guarantees: < salen */
					/* FALLTHROUGH */
				  default:
					if (sm_str_put(t->sm_2822_val,
							sm_str_rd_elem(sa, i)))
						goto error;
				}
			}
			++t;
			break;
		  default:
			/*
			 * The loop below only classifies the characters
			 * after the first one, so classify the first one
			 * here: a word starting with a backslash, a control
			 * character or a non-ASCII byte must be written as
			 * a quoted string too.
			 */
			t->sm_2822_type = T2822_ATOM;
			if (atomtype(sm_str_rd_elem(sa, i)) == T2822_QUOTED)
				t->sm_2822_type = T2822_QUOTED;
			t->sm_2822_val = sm_str_new(NULL, 10, 1024);
			if (t->sm_2822_val == NULL)
				goto error;
			++ta->sm_2822a_len;
			do
			{
				if (sm_str_rd_elem(sa, i) == '\\')
					if (++i >= salen)
						break;
				if (sm_str_put(t->sm_2822_val,
						sm_str_rd_elem(sa, i)))
					goto error;
				if (++i >= salen)
					break;
				r = atomtype(sm_str_rd_elem(sa, i));
				if (r == T2822_QUOTED)
					t->sm_2822_type = r;
			} while (r > 0);
			/* step back: the outer loop advances i again */
			--i;
			++t;
		}
	}
	return 1;

  error:
	/*
	 * ta->sm_2822a_len covers exactly the tokens built so far (the last
	 * one possibly with a truncated value); releasing them is left to
	 * the caller.
	 */
	return -1;
}

/*
 * Move the tokens collected in taaddr to the end of taout.
 *
 * The tokens are copied by struct assignment in their stored order and
 * taaddr is emptied afterwards (its array is kept for reuse).
 *
 * Returns 1 on success, 0 if taout could not be enlarged; nothing is
 * copied in that case.
 */
static int
gotaddr(sm_2822_a_P taout, sm_2822_a_P taaddr)
{
	uint src;

	if (t2822_readyplus(taout, taaddr->sm_2822a_len) == 0)
		return 0;

	src = 0;
	while (src < taaddr->sm_2822a_len)
	{
		taout->sm_2822a_t[taout->sm_2822a_len] =
					taaddr->sm_2822a_t[src];
		++taout->sm_2822a_len;
		++src;
	}

	taaddr->sm_2822a_len = 0;
	return 1;
}

/*
 * Rewrite the token array ta as an address list in taout.
 *
 * ta is scanned from its last token down to its third one; the first two
 * tokens are not interpreted and are copied unchanged at the end
 * (presumably they are the header field name and its colon -- confirm
 * against the callers).  The output is collected in reverse order in
 * taout and turned around by t2822_reverse() before returning.  taaddr
 * is scratch space holding the address currently being collected.
 *
 * Both taout and taaddr are emptied first.  Tokens are copied by struct
 * assignment, so token values in taout are shared with ta.
 *
 * Returns
 *	1	success
 *	0	syntax error: a ';' inside a group that is already open,
 *		a ':' without a matching ';', or a '>' without a '<'
 *	-1	out of memory
 * On 0 and -1 taout holds a partial, still reversed result.
 */
int
t2822_addrlist(sm_2822_a_P taout, sm_2822_a_P taaddr, sm_2822_a_P ta)
{
	bool ingroup, wordok;
	sm_2822_P t;
	sm_2822_P beginning;

	taout->sm_2822a_len = 0;
	taaddr->sm_2822a_len = 0;
	if (!t2822_readyplus(taout, 1))
		return -1;
	if (!t2822_readyplus(taaddr, 1))
		return -1;
	/* ingroup: a ';' has been seen and its ':' not yet (we scan backwards) */
	ingroup = false;
	/* wordok: false right after a word or a <...> part has been taken */
	wordok = true;
	/* the scan stops before the first two tokens */
	beginning = ta->sm_2822a_t + 2;
	t = ta->sm_2822a_t + ta->sm_2822a_len - 1;

	/* rfc 822 address lists are easy to parse from right to left */

/*
 * FLUSH	move a non-empty pending address from taaddr to taout
 * FLUSHCOMMA	same, followed by a comma token
 * ADDRLEFT	add the current token to taaddr and step one token left
 * OUTLEFT	add the current token to taout and step one token left
 * All of them return -1 from this function on allocation failure.
 */
#define FLUSH if (taaddr->sm_2822a_len) if (!gotaddr(taout, taaddr)) return -1;
#define FLUSHCOMMA if (taaddr->sm_2822a_len) { \
	if (!gotaddr(taout, taaddr)) return -1; \
	if (!t2822_append(taout, &comma)) return -1; }
#define ADDRLEFT if (!t2822_append(taaddr, t--)) return -1;
#define OUTLEFT if (!t2822_append(taout, t--)) return -1;

	while (t >= beginning)
	{
		switch(t->sm_2822_type)
		{
		  case T2822_SEMI:
			/* end of a group (seen first when scanning backwards) */
			FLUSHCOMMA
			if (ingroup)
				return 0;
			ingroup = true;
			wordok = true;
			break;
		  case T2822_COLON:
			/* start of a group: needs the ';' seen earlier */
			FLUSH
			if (!ingroup)
				return 0;
			ingroup = false;
			/* copy everything left of the ':' up to and including the next ',' */
			while ((t >= beginning) && (t->sm_2822_type != T2822_COMMA))
			 OUTLEFT
			if (t >= beginning)
			 OUTLEFT
			wordok = true;
			continue;
		  case T2822_RIGHT:
			/* '>': collect everything down to the matching '<' as one address */
			FLUSHCOMMA
			OUTLEFT
			while ((t >= beginning) && (t->sm_2822_type != T2822_LEFT))
			 ADDRLEFT
			/* important to use address here even if it's empty: <> */
			if (!gotaddr(taout, taaddr))
				return -1;
			/* ran off the scanned range without finding '<' */
			if (t < beginning)
				return 0;
			OUTLEFT
			/* the tokens in front of '<' go straight to the output */
			while ((t >= beginning) && ((t->sm_2822_type == T2822_COMMENT) || (t->sm_2822_type == T2822_ATOM) || (t->sm_2822_type == T2822_QUOTED) || (t->sm_2822_type == T2822_AT) || (t->sm_2822_type == T2822_DOT)))
			 OUTLEFT
			wordok = false;
			continue;
		  case T2822_ATOM: case T2822_QUOTED: case T2822_LITERAL:
			/*
			 * A word directly following another word (or a <...>
			 * part) starts a new address: flush the pending one
			 * with a comma first.
			 */
			if (!wordok)
			 FLUSHCOMMA
			wordok = false;
			ADDRLEFT
			continue;
		  case T2822_COMMENT:
			/* comment is lexically a space; shouldn't affect wordok */
			break;
		  case T2822_COMMA:
			FLUSH
			wordok = true;
			break;
		  default:
			/* any other token joins the pending address; a word may follow */
			wordok = true;
			ADDRLEFT
			continue;
		}
		/* reached via break: the current token itself goes to the output */
		OUTLEFT
	}
	FLUSH
	/* copy the leading tokens that were not scanned */
	++t;
	while (t > ta->sm_2822a_t)
	{
		if (!t2822_append(taout, --t))
			return -1;
	}

	/* taout was built back to front */
	t2822_reverse(taout);
	return 1;
}


/* syntax highlighted by Code2HTML, v. 0.9.1 */