/*
 * Copyright (c) 2004, 2006 Sendmail, Inc. and its suppliers.
 *	All rights reserved.
 *
 * By using this file, you agree to the terms and conditions set
 * forth in the LICENSE file which can be found at the top level of
 * the sendmail distribution.
 */

#include "sm/generic.h"
SM_RCSID("@(#)$Id: sm-conf-token.c,v 1.14 2006/01/09 19:06:25 ca Exp $")

#if SM_LIBCONF_ALONE
#include <limits.h>
#include <string.h>
#include <assert.h>
#include <errno.h>
#include <ctype.h>
#include <stdio.h>
#include "sm-conf.h"
#include "sm-util.h"
#else /* SM_LIBCONF_ALONE */
#include "sm/limits.h"
#include "sm/string.h"
#include "sm/assert.h"
#include "sm/error.h"
#include "sm/memops.h"
#include "sm/ctype.h"
#include <stdio.h>
#include "sm/sm-conf.h"
#endif /* SM_LIBCONF_ALONE */

#include "sm-conf-token.h"
#include "sm-conf-state.h"

/* SM-CONF-TOKEN.C -- tokenizer. */

/*
**  IS_ATOM_CHAR -- may character <r> be part of an unquoted atom?
**
**	True for every non-ASCII value, and for printable non-blank ASCII
**	characters other than the tokenizer's punctuation:  { } , ; = # "
**
**	Note: <r> is evaluated more than once.
*/

#define IS_ATOM_CHAR(r)			\
	(  !isascii(r)			\
	|| (  isgraph(r)		\
	   && (r) != '{'		\
	   && (r) != '}'		\
	   && (r) != ','		\
	   && (r) != ';'		\
	   && (r) != '='		\
	   && (r) != '#'		\
	   && (r) != '"'))

/*
**  IS_NEWLINE -- does <x> point (inside the input that ends at <e>)
**	at a carriage return or a line feed?
**
**	Note: <x> is evaluated more than once.
*/

#define IS_NEWLINE(x, e)		\
	((x) < (e) && (*(x) == '\r' || *(x) == '\n'))

/*
**  NEWLINE_END -- pointer just past the line terminator starting at <r>
**
**	<r> must point at a newline character inside the input (r < e).
**	A two-character terminator made of one '\r' and one '\n' in either
**	order counts as a single newline; two identical characters are
**	two newlines.
**
**	Note: <r> is evaluated more than once.
*/

#define NEWLINE_END(r, e)		\
	((r) + 1 + (  ((r) + 1 < (e))	\
		   && (r)[1] != (r)[0]	\
		   && ((r)[1] == '\n' || (r)[1] == '\r')))

/*
**  SM_CONF_TOKEN_LOOKAHEAD -- look ahead to the next token
**
**	Classifies the next token by its first character only; nothing
**	is consumed and nothing is decoded.
**
**	Parameters:
**		smc -- a buffered configuration file.
**
**	Returns:
**		SM_CONF_TOKEN_EOF on running out of input (or smc == NULL),
**		SM_CONF_TOKEN_ERROR if the next character cannot start
**			a token,
**		otherwise the type of the next non-whitespace,
**		non-comment token.
**
**	Side Effects:
**		none.
**
*/

enum sm_conf_token_type_E
sm_conf_token_lookahead(sm_conf_T const *smc)
{
	char const *e, *s;

	if (smc == NULL || smc->smc_buf_i >= smc->smc_buf_n)
		return SM_CONF_TOKEN_EOF;

	s = smc->smc_buf + smc->smc_buf_i;
	e = smc->smc_buf + smc->smc_buf_n;

	/* Skip white space and comments. */
	while (s < e)
	{
		if (*s == '#')
		{
			/* a comment extends to the end of the line */
			while (s < e && !IS_NEWLINE(s, e))
				s++;

			/*
			**  Only step over a line terminator that is really
			**  there; a comment ending at EOF must not move <s>
			**  beyond <e>.
			*/

			if (s < e)
				s = NEWLINE_END(s, e);
		}
		else if (ISSPACE(*s))
			s++;
		else
			break;
	}
	if (s >= e)
		return SM_CONF_TOKEN_EOF;

	switch (*s)
	{
	  case '"':
		return SM_CONF_TOKEN_STRING;

	  case '}':
	  case '{':
	  case ';':
	  case ',':
	  case '=':
		/* the character itself is returned as the token type */
		return *s;

	 default:
		break;
	}

	if (!IS_ATOM_CHAR(*s))
		return SM_CONF_TOKEN_ERROR;

	return SM_CONF_TOKEN_ATOM;
}


/*
**  SM_CONF_SKIP_SPACE -- advance pointer to the beginning of the next token
**
**	Parameters:
**		smc -- a buffered configuration file.
**
**	Returns:
**		none
**
**	Side Effects:
**		advance buffer pointer and, if newlines are passed,
**		line number.  The buffer index never exceeds the
**		buffer length.
**
*/

static void
sm_conf_skip_space(sm_conf_T *smc)
{
	char	*e, *s;

	if (smc == NULL || smc->smc_buf_i >= smc->smc_buf_n)
		return;

	s = smc->smc_buf + smc->smc_buf_i;
	e = smc->smc_buf + smc->smc_buf_n;

	/* Skip white space and comments. */
	while (s < e)
	{
		if (*s == '#')
		{
			/* a comment extends to the end of the line */
			while (s < e && !IS_NEWLINE(s, e))
				s++;

			/*
			**  A comment that runs into EOF has no terminator
			**  to count or to step over; leave <s> at <e>.
			*/

			if (s < e)
			{
				smc->smc_line++;
				s = NEWLINE_END(s, e);
			}
		}
		else if (ISSPACE(*s))
		{
			if (IS_NEWLINE(s, e))
			{
				/* "\r\n" and "\n\r" count as one line */
				s = NEWLINE_END(s, e);
				smc->smc_line++;
			}
			else
				s++;
		}
		else
			break;
	}
	smc->smc_buf_i = s - smc->smc_buf;
}


/*
**  SM_CONF_TOKEN_HEXVALUE -- numeric value of a single hexadecimal digit
**
**	Parameters:
**		c -- the character to convert
**
**	Returns:
**		0..15 for a decimal digit or a letter a-f / A-F,
**		-1 for anything else.
*/

static int
sm_conf_token_hexvalue(unsigned char c)
{
	static char const	hex_upper[] = "ABCDEF";
	static char const	hex_lower[] = "abcdef";
	int			k;

	if (ISDIGIT(c))
		return c - '0';

	/* letters: position in the table, offset by ten */
	for (k = 0; k < 6; k++)
	{
		if (c == hex_upper[k] || c == hex_lower[k])
			return 0xA + k;
	}

	return -1;
}

/*
**  SM_CONF_TOKEN_ULONG_TO_UTF8 -- append a code point, UTF-8 encoded
**
**	Values up to 127 are written as a single byte; larger values
**	below 2^31 are written as a lead byte followed by one to five
**	continuation bytes (2- to 6-byte forms).  No other range
**	checks are made on the value.
**
**	Parameters:
**		ch -- value to encode
**		out -- in/out: write position; advanced past the bytes
**			written.  The caller must provide room for
**			up to 6 bytes.
**
**	Returns:
**		0 on success,
**		SM_CONF_ERR_CHAR_OVERFLOW if <ch> needs more than 31 bits
**			(nothing is written).
*/

static enum SM_CONF_ERROR_E
sm_conf_token_ulong_to_utf8(unsigned long ch, char **out)
{
	unsigned int     i, nbits;

	if (ch <= 127)
	{
		*(*out)++ = (unsigned char)ch;
		return 0;
	}

	/*
	**  Find the sequence length <i>: an i-byte sequence carries
	**  <nbits> payload bits (11, 16, 21, 26, 31 for i = 2..6).
	*/

	for (i = 2, nbits = 6 + 5; i <= 6; i++, nbits += 5)
		if (ch < (unsigned long)1 << nbits)
			break;

	if (i > 6)
		return SM_CONF_ERR_CHAR_OVERFLOW;

	/* lead byte: <i> high one-bits, then the topmost payload bits */
	**out  = (unsigned char)((unsigned char)-1 << (8 - i));
	**out |= (unsigned char)(ch >> (nbits / 6 * 6));
	(*out)++;

	/* <nbits> is now the number of payload bits still to be written */
	nbits -= nbits % 6;

	while (--i > 0)
	{
		/*
		**  Step down to the next 6-bit group *before* extracting
		**  it, so the last continuation byte carries bits 0..5.
		*/

		nbits -= 6;
		*(*out)++ = (unsigned char)(0x80 | (0x3F & (ch >> nbits)));
	}
	return 0;
}

enum SM_CONF_ERROR_E
sm_conf_token_backslash(
	char			**s_ptr,
	char const		**r_ptr,
	char const		*e)
{
	unsigned long		ul = 0;
	size_t			i;
	enum SM_CONF_ERROR_E	err;
	char			*s = *s_ptr;
	char const		*r = *r_ptr;

	err = 0;
	ul = 0;
	i = 4;

	if (r >= e)
		err = SM_CONF_ERR_EOF_IN_STRING;

	else if (IS_NEWLINE(r, e))
		err = SM_CONF_ERR_NEWLINE_IN_STRING;

	else
	{
		switch (*r++)
		{
		  case 'a':
			*s++ = '\a';
			break;
		  case 'b':
			*s++ = '\b';
			break;
		  case 'f':
			*s++ = '\f';
			break;
		  case 'n':
			*s++ = '\n';
			break;
		  case 'r':
			*s++ = '\r';
			break;
		  case 't':
			*s++ = '\t';
			break;
		  case 'v':
			*s++ = '\v';
			break;

		  case 'x':
			/* \xNNNNNN... */
			if (r >= e)
			{
				*s_ptr = s;
				*r_ptr = r;
				return SM_CONF_ERR_EOF_IN_STRING;
			}
			if (!ISXDIGIT(*r))
			{
				r++;
				err = SM_CONF_ERR_HEX_EXPECTED;
				break;
			}

			while (r < e && ISXDIGIT(*r))
			{
				ul = (ul << 4) | sm_conf_token_hexvalue(*r++);
				if ((ul >> CHAR_BIT) != 0)
				{
					err = SM_CONF_ERR_CHAR_OVERFLOW;
					break;
				}
			}
			if (err == 0)
				*s++ = (unsigned char)ul;
			break;

		  case 'U':
			i = 8;
		  case 'u':
			/* \u1234, \U12345678 */

			while (i-- > 0)
			{
				if (r > e)
				{
					r = e;
					err = SM_CONF_ERR_EOF_IN_STRING;
					break;
				}
				if (!ISXDIGIT(*r))
				{
					err = SM_CONF_ERR_HEX_EXPECTED;
					r++;
					break;
				}
				ul = (ul << 4) | sm_conf_token_hexvalue(*r++);
			}

			if (err == 0)
				/* UTF-8 encode <ul> */
				err = sm_conf_token_ulong_to_utf8(ul, &s);
			break;

		  default:
			r--;
			if (*r < '0' || *r > '7')
				*s++ = *r++;
			else
			{
				/* \oct [oct [oct]] */

				ul = *r++ - '0';
				if (r < e && *r >= '0' && *r <= '7')
				{
					ul = (ul << 3) | (*r++ - '0');
					if (r < e && *r >= '0' && *r <= '7')
						ul = (ul << 3) | (*r++ - '0');
				}
				if ((ul >> CHAR_BIT) > 0)
				{
					err = SM_CONF_ERR_CHAR_OVERFLOW;
					break;
				}
				*s++ = ul;
			}
			break;
		}
	}

	if (err == 0)
		*s_ptr = s;
	else
	{
		if (r > *r_ptr)
			/* copy the spelling of the problematic token */
			sm_memcpy(*s_ptr, *r_ptr, r - *r_ptr);
		*s_ptr += r - *r_ptr;
	}
	*r_ptr = r;
	return err;
}

/*
**  SM_CONF_TOKEN -- read a token or token group from the input stream
**
**	If nothing goes wrong, <tok> is filled with pointers to
**	the contents of the next token on the input stream.
**
**	On correct encoding, the return value and the tok->sct_type
**	match, and tok->sct_text and tok->sct_text_n delineate
**	the (decoded, if necessary) contents of the token.
**	The returned value data is not '\0'-terminated; instead,
**	length values are provided.
**
**	The input data buffer is overwritten with decoded tokens
**	as this happens.
**
**	Adjacent quoted strings are concatenated without a separator.
**	Unless SM_CONF_TOKEN_FLAG_IDENTIFIER is given, a run of
**	consecutive strings and atoms is merged into one atom, with a
**	single space inserted wherever white space or a comment
**	separated two of them.
**
**	Parameters:
**		smc -- stream to read the token from
**		tok -- token structure to fill
**		flags -- details about what token to read:
**			SM_CONF_TOKEN_FLAG_IDENTIFIER
**				-- don't combine atoms
**
**	Returns:
**		SM_CONF_TOKEN_ERROR on error (and tok->sct_error
**			contains the specific error code)
**		SM_CONF_TOKEN_EOF on EOF or if smc is NULL (and
**			tok->sct_type is also SM_CONF_TOKEN_EOF)
**		a token type > 0 otherwise.
*/

enum sm_conf_token_type_E
sm_conf_token(sm_conf_T *smc, sm_conf_token_T *tok, unsigned int flags)
{
	char const			*r;
	char				*e, *s;
	enum SM_CONF_ERROR_E		err;
	int				closed;

	SM_IS_CONF_TOKEN(tok);

	if (smc == NULL)
		return tok->sct_type = SM_CONF_TOKEN_EOF;

	sm_conf_skip_space(smc);
	s = smc->smc_buf + smc->smc_buf_i;
	e = smc->smc_buf + smc->smc_buf_n;
	tok->sct_line = smc->smc_line;
	tok->sct_text = s;

	if (s >= e)
	{
		smc->smc_buf_i = s - smc->smc_buf;
		return tok->sct_type = SM_CONF_TOKEN_EOF;
	}

	switch (*s)
	{
	  case '"':
		tok->sct_type = SM_CONF_TOKEN_STRING;

		/*
		**  Remember whether the closing quote was seen; running
		**  into the end of input alone does not tell, because a
		**  closing quote may be the very last input character.
		*/

		closed = 0;

		/*
		**  quote-delimited string; unquote on the fly
		**  <r> reads the raw text, <s> writes the decoded text,
		**  starting on top of the opening quote.
		*/

		for (r = s + 1; r < e;)
		{
			char *s0;

			if (IS_NEWLINE(r, e))
			{
				tok->sct_text_n = s - tok->sct_text;
				tok->sct_error = SM_CONF_ERR_NEWLINE_IN_STRING;

				/* resume scanning on the next line */
				r = NEWLINE_END(r, e);
				smc->smc_buf_i = r - smc->smc_buf;
				smc->smc_line++;

				return SM_CONF_TOKEN_ERROR;
			}
			else if (*r == '"')
			{
				r++;
				smc->smc_buf_i = r - smc->smc_buf;
				if (sm_conf_token_lookahead(smc)
				   == SM_CONF_TOKEN_STRING)
				{
					/* concatenate adjacent strings */
					sm_conf_skip_space(smc);

					/* + 1: step over the opening quote */
					r = smc->smc_buf + smc->smc_buf_i + 1;
					assert(r <= e);

					/* rescan first string char */
					continue;
				}
				else
				{
					closed = 1;
					break;
				}
			}

			if (*r != '\\')
			{
				*s++ = *r++;
				continue;
			}

			/* a backslash as the last input character */
			if (r + 1 >= e)
			{
				smc->smc_buf_i	= smc->smc_buf_n;
				tok->sct_text_n = s - tok->sct_text;
				tok->sct_error  = SM_CONF_ERR_EOF_IN_STRING;

				return SM_CONF_TOKEN_ERROR;
			}
			r++;

			s0 = s;
			err = sm_conf_token_backslash(&s, &r, e);
			if (err != 0)
			{
				/*
				**  The decoder left the raw spelling of the
				**  bad escape at <s0>; shift it by one to
				**  reinsert the \ in front of it.
				*/

				if (s > s0)
					sm_memmove(s0 + 1, s0, s - s0);
				*s0 = '\\';
				s++;

				smc->smc_buf_i  = r - smc->smc_buf;
				tok->sct_text_n = s - tok->sct_text;
				tok->sct_error = err;

				return SM_CONF_TOKEN_ERROR;
			}
		}

		tok->sct_text_n = s - tok->sct_text;
		smc->smc_buf_i = r - smc->smc_buf;
		if (!closed)
		{
			/* input ended before the closing quote */
			tok->sct_error = SM_CONF_ERR_EOF_IN_STRING;
			return SM_CONF_TOKEN_ERROR;
		}
		break;

	  case '}':
	  case '{':
	  case ';':
	  case ',':
	  case '=':
		/* single-character token; its type is the character. */
		smc->smc_buf_i++;
		tok->sct_text_n = 1;
		return tok->sct_type = *tok->sct_text;

	 default:
		if (!IS_ATOM_CHAR(*s))
		{
			/* consume the bad character so scanning can go on */
			smc->smc_buf_i++;
			tok->sct_text_n = 1;
			tok->sct_type = SM_CONF_TOKEN_ATOM;
			tok->sct_error = SM_CONF_ERR_BAD_CHAR;

			return SM_CONF_TOKEN_ERROR;
		}

		/* an atom is a maximal run of atom characters */
		for (s++; s < e && IS_ATOM_CHAR(*s); s++)
			;
		smc->smc_buf_i = s - smc->smc_buf;
		tok->sct_type = SM_CONF_TOKEN_ATOM;

		break;
	}

	/*
	**  If we arrive here, we read either a string or an atom.
	**  <smc->smc_buf_i> has been updated and points just past
	**  what we read.  <s> points just past the token contents so far.
	*/

	if ((flags & SM_CONF_TOKEN_FLAG_IDENTIFIER) == 0)
	{
		enum sm_conf_token_type_E la;

		/*
		**  Since we're not parsing just an identifier,
		**  combine trailing text tokens (strings or atoms)
		**  with the one we just read.
		*/

		while (  (la = sm_conf_token_lookahead(smc))
			    == SM_CONF_TOKEN_STRING
		      || la == SM_CONF_TOKEN_ATOM)
		{
			sm_conf_token_T	next;
			int		have_space;

			/*
			**  If there is white space between here and our next
			**  token start, add a single space character.
			**
			**  Note that adjacent quoted strings are joined
			**  by the string parser and will *not* get space
			**  between them.
			*/

			have_space = smc->smc_buf_i < smc->smc_buf_n
				  && isascii(smc->smc_buf[smc->smc_buf_i])
				  && (  isspace(smc->smc_buf[smc->smc_buf_i])
				      || smc->smc_buf[smc->smc_buf_i] == '#');

			sm_memzero(&next, sizeof(next));
			next.sm_magic = SM_CONF_TOKEN_MAGIC;

			/*
			**  The recursive call itself merges everything that
			**  follows <next>, so <next> is the whole rest of
			**  the group.
			*/

			la = sm_conf_token(smc, &next, flags);
			if (  la != SM_CONF_TOKEN_ATOM
			   && la != SM_CONF_TOKEN_STRING)
			{
				SM_ASSERT(la == SM_CONF_TOKEN_ERROR);
				SM_ASSERT(next.sct_error != 0);

				/* report the failing piece, not the group */
				tok->sct_error  = next.sct_error;
				tok->sct_text   = next.sct_text;
				tok->sct_text_n = next.sct_text_n;

				return la;
			}

			/*
			**  Append the next token to this one, separated
			**  by white space as needed.
			*/

			if (have_space)
				*s++ = ' ';
			sm_memmove(s, next.sct_text, next.sct_text_n);
			s += next.sct_text_n;

			/*
			**  Even if this started out as a string,
			**  by now it's just a (mixed) atom.
			*/

			tok->sct_type = SM_CONF_TOKEN_ATOM;
		}
	}
	tok->sct_text_n = s - tok->sct_text;

	return tok->sct_type;
}

/*
**  SM_CONF_TOKEN_STRING_CHAR -- append one character in quoted-string form
**
**	Printable characters and the blank are written as they are (a
**	double quote gets a backslash in front); NUL and the usual
**	control characters become two-character escapes; everything
**	else becomes a backslash plus three octal digits, followed by
**	a '\0' that is not counted in the returned position.
**
**	Parameters:
**		w -- where to write; needs room for up to 5 bytes
**		ch -- the character to render
**
**	Returns:
**		the position just past the characters written.
*/

static char *
sm_conf_token_string_char(char *w, int ch)
{
	char	letter;

	if (ISGRAPH(ch) || ch == ' ')
	{
		if (ch == '"')
			*w++ = '\\';
		*w++ = ch;
		return w;
	}

	/* pick the letter of a two-character escape, if there is one */
	switch (ch)
	{
	  case '\0':
		letter = '0';
		break;
	  case '\a':
		letter = 'a';
		break;
	  case '\b':
		letter = 'b';
		break;
	  case '\f':
		letter = 'f';
		break;
	  case '\n':
		letter = 'n';
		break;
	  case '\r':
		letter = 'r';
		break;
	  case '\t':
		letter = 't';
		break;
	  case '\v':
		letter = 'v';
		break;
	  default:
		/* no short form: three-digit octal escape */
		snprintf(w, 5, "\\%3.3o", (unsigned char)ch);
		return w + 4;
	}

	w[0] = '\\';
	w[1] = letter;
	return w + 2;
}

/*
**  SM_CONF_TOKEN_STRING -- render a token for use in messages
**
**	Parameters:
**		tok -- the token to describe (may be NULL)
**		buf -- scratch buffer for the result
**		bufsize -- # of bytes available in <buf>
**
**	Returns:
**		a '\0'-terminated description: either a constant
**		string or <buf>.  Strings are shown quoted and escaped,
**		atoms in single quotes; string contents that do not
**		fit are cut off and marked with "...".  If <buf> is too
**		small for even that, the generic words "string" or
**		"atom" are returned instead.
*/

char const *
sm_conf_token_string(
	sm_conf_token_T *tok,
	char		*buf,
	size_t		bufsize)
{
	size_t		n;
	char		*w;

	if (tok == NULL)
		return "(null)";

	switch (tok->sct_type)
	{
	  case SM_CONF_TOKEN_ERROR:
		return "scanner error";

	  case SM_CONF_TOKEN_EOF:
		return "EOF";

	  case SM_CONF_TOKEN_NONE:
		return "internal error";

	  case SM_CONF_TOKEN_STRING:
		if (bufsize <= 8)
			return "string";

		n = 0;
		w = buf;
		*w++ = '"';

		/*
		**  Before rendering another character, make sure there
		**  is room for its longest form (4 bytes, "\ooo") plus
		**  the worst-case tail: "..." (3), the closing quote (1)
		**  and the terminating '\0' (1) -- 9 bytes in total.
		*/

		while ((buf + bufsize) - w >= 9 && n < tok->sct_text_n)
			w = sm_conf_token_string_char(w, tok->sct_text[n++]);
		if (n < tok->sct_text_n)
		{
			/* not everything fit */
			*w++ = '.';
			*w++ = '.';
			*w++ = '.';
		}
		*w++ = '"';
		*w = '\0';
		return buf;

	  case SM_CONF_TOKEN_ATOM:
		if (bufsize <= 4)
			return "atom";

		/* snprintf truncates as needed; text is not terminated */
		snprintf(buf, bufsize, "'%.*s'",
			(int)tok->sct_text_n,
			tok->sct_text);
		return buf;

	  case SM_CONF_TOKEN_SEMI:
		return "';'";

	  case SM_CONF_TOKEN_OBRACE:
		return "'{'";

	  case SM_CONF_TOKEN_CBRACE:
		return "'}'";

	  case SM_CONF_TOKEN_COMMA:
		return "','";

	  case SM_CONF_TOKEN_EQUAL:
		return "'='";

	  default:
		break;
	}
	snprintf(buf, bufsize, "<unexpected token %d>", (int)tok->sct_type);
	return buf;
}


/*
**  SM_CONF_TOKEN_MATCH_CHAR -- compare two bytes, ignoring ASCII case
**
**	Returns:
**		1 if <u> and <d> are equal (ASCII letters compare
**		without regard to case), 0 otherwise.
*/

static int
sm_conf_token_match_char(char u, char d)
{
	/* <ctype.h> functions must not be handed negative char values */
	unsigned char	uc = (unsigned char)u;
	unsigned char	dc = (unsigned char)d;

	if (isascii(uc))
		return tolower(uc) == tolower(dc);
	return uc == dc;
}

/*
**  SM_CONF_TOKEN_MATCH_HEXDIGIT -- value of an ASCII hex digit, or -1
*/

static int
sm_conf_token_match_hexdigit(char ch)
{
	unsigned char	c = (unsigned char)ch;

	if (!isascii(c) || !isxdigit(c))
		return -1;
	if (isdigit(c))
		return c - '0';

	/* relies on 'a'..'f' being consecutive, as they are in ASCII */
	return tolower(c) - 'a' + 10;
}

/*
**  SM_CONF_TOKEN_MATCH -- match a tokenized string against data
**
**	Compare a user-specified string with a name as recorded in the data.
**	Evaluate \ in the user data without allocating a separate copy.
**	ASCII letters are compared without regard to case.
**
**	Parameters:
**		smc -- handle of the environment in which all this happens
**			(not used by this function)
**		user -- user-specified string (interpreted like
**			tokens in a file)
**		user_n -- # of bytes pointed to by <user>
**		data -- internal string
**		data_n -- # of bytes pointed to by <data>
**
**	Returns:
**		1 if the two strings match (or both are NULL),
**		0 if they don't or on encoding error.
*/

int
sm_conf_token_match(
	sm_conf_T		*smc,
	char const		*user,
	size_t			user_n,
	char const		*data,
	size_t			data_n)
{
	char const		*user_e;
	char const		*data_e;
	char const		*r;
	char			buf[10], *buf_ptr;
	unsigned long		ul;
	int			digit;

	if (user == NULL && data == NULL)
		return 1;
	if (user == NULL || data == NULL)
		return 0;

	user_e = user + user_n;
	data_e = data + data_n;

	while (user < user_e)
	{
		if (*user != '\\')
		{
			/* <user> is longer than <data>: no match */
			if (data >= data_e)
				return 0;
			if (!sm_conf_token_match_char(*user, *data))
				return 0;

			user++;
			data++;
			continue;
		}

		/* a backslash needs something to escape */
		if (user + 1 >= user_e)
			return 0;

		user++;
		buf_ptr = buf;
		if (*user == 'x')
		{
			/*
			**  \xNN... is decoded here rather than by
			**  sm_conf_token_backslash(): on a failed
			**  escape that function copies the complete
			**  spelling it consumed to its output, and
			**  a hex escape may have any number of
			**  (leading zero) digits -- more than <buf>
			**  can hold.  The rules are the same: at
			**  least one hex digit, and the value must
			**  fit into a char.
			*/

			user++;
			if (user >= user_e
			   || sm_conf_token_match_hexdigit(*user) < 0)
				return 0;

			ul = 0;
			while (  user < user_e
			      && (digit = sm_conf_token_match_hexdigit(*user))
				 >= 0)
			{
				user++;
				ul = (ul << 4) | (unsigned long)digit;
				if ((ul >> CHAR_BIT) != 0)
					return 0;
			}
			*buf_ptr++ = (char)(unsigned char)ul;
		}
		else
		{
			/*
			**  All remaining escapes consume at most 9
			**  input bytes ("U" + 8 digits) and decode
			**  to at most 6 bytes, so <buf> is big enough
			**  for both the result and an error spelling.
			*/

			if (sm_conf_token_backslash(&buf_ptr, &user, user_e))
				return 0;
		}

		/* compare the decoded bytes */
		for (r = buf; r < buf_ptr; r++)
		{
			if (data >= data_e)
				return 0;
			if (!sm_conf_token_match_char(*r, *data))
				return 0;
			data++;
		}
	}

	/* a match only if <data> has been used up as well */
	return data == data_e;
}