/*
* Copyright (c) 2004, 2006 Sendmail, Inc. and its suppliers.
* All rights reserved.
*
* By using this file, you agree to the terms and conditions set
* forth in the LICENSE file which can be found at the top level of
* the sendmail distribution.
*/
#include "sm/generic.h"
SM_RCSID("@(#)$Id: sm-conf-token.c,v 1.14 2006/01/09 19:06:25 ca Exp $")
#if SM_LIBCONF_ALONE
#include <limits.h>
#include <string.h>
#include <assert.h>
#include <errno.h>
#include <ctype.h>
#include <stdio.h>
#include "sm-conf.h"
#include "sm-util.h"
#else /* SM_LIBCONF_ALONE */
#include "sm/limits.h"
#include "sm/string.h"
#include "sm/assert.h"
#include "sm/error.h"
#include "sm/memops.h"
#include "sm/ctype.h"
#include <stdio.h>
#include "sm/sm-conf.h"
#endif /* SM_LIBCONF_ALONE */
#include "sm-conf-token.h"
#include "sm-conf-state.h"
/* SM-CONF-TOKEN.C -- tokenizer. */
/*
**  IS_ATOM_CHAR -- may character <r> be part of an unquoted atom?
**
**	Every non-ASCII value qualifies.  An ASCII value qualifies if it
**	is a graphic character other than one of the punctuation
**	characters that have a meaning of their own in the grammar:
**	{ } , ; = # "
**
**	Note: <r> is evaluated more than once; do not pass an expression
**	with side effects.
*/
#define IS_ATOM_CHAR(r)						\
	(   !isascii(r)						\
	 || (   isgraph(r)					\
	     && !(   (r) == '{' || (r) == '}'			\
		  || (r) == ',' || (r) == ';'			\
		  || (r) == '=' || (r) == '#'			\
		  || (r) == '"')))
/*
**  IS_NEWLINE -- does <x> point at a line terminator inside the buffer?
**
**	True iff <x> lies before the end pointer <e> and the character
**	it points to is a carriage return or a line feed.
**	Both arguments are fully parenthesized so that arbitrary
**	expressions can be passed; <x> is evaluated more than once.
*/
#define IS_NEWLINE(x, e)	\
	((x) < (e) && (*(x) == '\r' || *(x) == '\n'))
/*
**  NEWLINE_END -- pointer just past the line terminator starting at <r>
**
**	<r> must point at a '\r' or '\n' that lies before the end
**	pointer <e>.  The result is <r> + 1, or <r> + 2 if the next
**	character is still inside the buffer and is the *other* line
**	terminator character (so "\r\n" and "\n\r" count as one line
**	end, while "\n\n" counts as two).
**	Both arguments are fully parenthesized; <r> is evaluated more
**	than once.
*/
#define NEWLINE_END(r, e)				\
	((r) + 1 + (   ((r) + 1 < (e))			\
		    && (r)[1] != (r)[0]			\
		    && ((r)[1] == '\n' || (r)[1] == '\r')))
/*
**  SM_CONF_TOKEN_LOOKAHEAD -- look ahead to the next token
**
**	Classifies the next token by its first character only; nothing
**	is consumed and <smc> is not modified.
**
**	Parameters:
**		smc -- a buffered configuration file (may be NULL).
**
**	Returns:
**		SM_CONF_TOKEN_EOF if <smc> is NULL or only white space
**			and comments remain,
**		SM_CONF_TOKEN_ERROR if the next character can start
**			neither a string, a punctuation token nor an atom,
**		SM_CONF_TOKEN_STRING if the next character is a '"',
**		the character itself for one of } { ; , =
**		SM_CONF_TOKEN_ATOM otherwise.
**
**	Side Effects:
**		none.
*/

enum sm_conf_token_type_E
sm_conf_token_lookahead(sm_conf_T const *smc)
{
	char const	*e, *s;

	if (smc == NULL || smc->smc_buf_i >= smc->smc_buf_n)
		return SM_CONF_TOKEN_EOF;

	s = smc->smc_buf + smc->smc_buf_i;
	e = smc->smc_buf + smc->smc_buf_n;

	/* Skip white space and comments. */
	while (s < e)
	{
		if (*s == '#')
		{
			/* a comment extends to the end of the line */
			while (s < e && !IS_NEWLINE(s, e))
				s++;

			/*
			**  Only step over a line terminator if there is
			**  one: a comment that runs into the end of the
			**  buffer must not move <s> beyond <e>.
			*/

			if (s < e)
				s = NEWLINE_END(s, e);
		}
		else if (ISSPACE(*s))
			s++;
		else
			break;
	}
	if (s >= e)
		return SM_CONF_TOKEN_EOF;

	switch (*s)
	{
	  case '"':
		return SM_CONF_TOKEN_STRING;

	  case '}':
	  case '{':
	  case ';':
	  case ',':
	  case '=':
		/* punctuation tokens use their character as token type */
		return *s;

	  default:
		break;
	}

	if (!IS_ATOM_CHAR(*s))
		return SM_CONF_TOKEN_ERROR;
	return SM_CONF_TOKEN_ATOM;
}
/*
**  SM_CONF_SKIP_SPACE -- advance pointer to the beginning of the next token
**
**	Parameters:
**		smc -- a buffered configuration file (may be NULL).
**
**	Returns:
**		none
**
**	Side Effects:
**		advances smc->smc_buf_i past white space and '#' comments
**		and increments smc->smc_line for every line terminator
**		that is passed.  smc->smc_buf_i never ends up beyond
**		smc->smc_buf_n.
*/

static void
sm_conf_skip_space(sm_conf_T *smc)
{
	char	*e, *s;

	if (smc == NULL || smc->smc_buf_i >= smc->smc_buf_n)
		return;

	s = smc->smc_buf + smc->smc_buf_i;
	e = smc->smc_buf + smc->smc_buf_n;

	/* Skip white space and comments. */
	while (s < e)
	{
		if (*s == '#')
		{
			/* a comment extends to the end of the line */
			while (s < e && !IS_NEWLINE(s, e))
				s++;

			/*
			**  A comment that runs into the end of the buffer
			**  has no line terminator: do not count a line
			**  and do not step past the end of the buffer.
			*/

			if (s < e)
			{
				smc->smc_line++;
				s = NEWLINE_END(s, e);
			}
		}
		else if (ISSPACE(*s))
		{
			if (IS_NEWLINE(s, e))
			{
				/* "\r\n" and "\n\r" count as one line */
				s = NEWLINE_END(s, e);
				smc->smc_line++;
			}
			else
				s++;
		}
		else
			break;
	}
	smc->smc_buf_i = s - smc->smc_buf;
}
/*
**  SM_CONF_TOKEN_HEXVALUE -- numeric value of a hexadecimal digit
**
**	Parameters:
**		c -- character to convert
**
**	Returns:
**		0..15 if <c> is a decimal digit or one of a-f / A-F,
**		-1 otherwise.
*/

static int
sm_conf_token_hexvalue(unsigned char c)
{
	/* upper/lower case pairs, in order of increasing value from 0xA */
	static char const	letters[] = "AaBbCcDdEeFf";
	size_t			k;

	if (ISDIGIT(c))
		return c - '0';

	for (k = 0; k < sizeof(letters) - 1; k++)
	{
		if (c == (unsigned char)letters[k])
			return 0xA + (int)(k / 2);
	}
	return -1;
}
/*
**  SM_CONF_TOKEN_ULONG_TO_UTF8 -- append a character code as UTF-8
**
**	Uses the original UTF-8 layout with sequences of up to six
**	bytes, i.e. values below 2^31 are accepted.
**
**	Parameters:
**		ch -- character code to encode
**		out -- in/out: pointer to the write position; advanced
**			past the bytes written (1 to 6).
**
**	Returns:
**		0 on success,
**		SM_CONF_ERR_CHAR_OVERFLOW if <ch> does not fit into six
**			bytes (nothing is written in that case).
*/

static enum SM_CONF_ERROR_E
sm_conf_token_ulong_to_utf8(unsigned long ch, char **out)
{
	unsigned int	i, nbits;

	/* single byte: plain ASCII */
	if (ch <= 127)
	{
		*(*out)++ = (unsigned char)ch;
		return 0;
	}

	/*
	**  Find the sequence length <i>: a sequence of <i> bytes carries
	**  11, 16, 21, 26, 31 bits for i = 2, 3, 4, 5, 6.
	*/

	for (i = 2, nbits = 6 + 5; i <= 6; i++, nbits += 5)
		if (ch < (unsigned long)1 << nbits)
			break;
	if (i > 6)
		return SM_CONF_ERR_CHAR_OVERFLOW;

	/* the <i> - 1 continuation bytes carry 6 bits each */
	nbits = (i - 1) * 6;

	/* lead byte: <i> one-bits, a zero bit, then the topmost payload bits */
	**out = (unsigned char)((unsigned char)-1 << (8 - i));
	**out |= (unsigned char)(ch >> nbits);
	(*out)++;

	/*
	**  Continuation bytes, most significant group first.  The shift
	**  is lowered *before* each byte is written, so that the last
	**  byte carries the lowest six bits of <ch>.
	*/

	while (--i > 0)
	{
		nbits -= 6;
		*(*out)++ = (unsigned char)(0x80 | (0x3F & (ch >> nbits)));
	}
	return 0;
}
/*
**  SM_CONF_TOKEN_BACKSLASH -- decode one backslash escape sequence
**
**	Understands \a \b \f \n \r \t \v, \x followed by hex digits,
**	\u with four and \U with eight hex digits (stored UTF-8 encoded),
**	and up to three octal digits.  Any other character stands for
**	itself.
**
**	Parameters:
**		s_ptr -- in/out: write position for the decoded bytes.
**			May point into the same buffer as *r_ptr, as long
**			as it does not lie behind it.
**		r_ptr -- in/out: read position; on entry it points at the
**			character that follows the backslash.
**		e -- end of the readable input.
**
**	Returns:
**		0 on success; *s_ptr has been advanced past the decoded
**			bytes and *r_ptr past the escape sequence.
**		an SM_CONF_ERR_* code otherwise.  In that case the input
**			consumed so far (without the backslash) is copied
**			verbatim to *s_ptr and *s_ptr is advanced past the
**			copy; *r_ptr is advanced past the consumed input.
**			(Exception: if the input ends directly after
**			"\x", *s_ptr is left unchanged.)
**
**	Note: on a \x overflow error the copied spelling is as long as
**	the run of hex digits that was consumed; the destination must be
**	able to hold it.
*/

enum SM_CONF_ERROR_E
sm_conf_token_backslash(
	char		**s_ptr,
	char const	**r_ptr,
	char const	*e)
{
	unsigned long		ul = 0;
	size_t			i;
	enum SM_CONF_ERROR_E	err;
	char			*s = *s_ptr;
	char const		*r = *r_ptr;

	err = 0;
	i = 4;		/* number of hex digits for \u; \U raises it to 8 */

	if (r >= e)
		err = SM_CONF_ERR_EOF_IN_STRING;
	else if (IS_NEWLINE(r, e))
		err = SM_CONF_ERR_NEWLINE_IN_STRING;
	else
	{
		switch (*r++)
		{
		  case 'a':
			*s++ = '\a';
			break;

		  case 'b':
			*s++ = '\b';
			break;

		  case 'f':
			*s++ = '\f';
			break;

		  case 'n':
			*s++ = '\n';
			break;

		  case 'r':
			*s++ = '\r';
			break;

		  case 't':
			*s++ = '\t';
			break;

		  case 'v':
			*s++ = '\v';
			break;

		  case 'x':
			/* \xNNNNNN... */
			if (r >= e)
			{
				*s_ptr = s;
				*r_ptr = r;
				return SM_CONF_ERR_EOF_IN_STRING;
			}
			if (!ISXDIGIT(*r))
			{
				r++;
				err = SM_CONF_ERR_HEX_EXPECTED;
				break;
			}

			/* any number of digits; the value must fit a char */
			while (r < e && ISXDIGIT(*r))
			{
				ul = (ul << 4) | sm_conf_token_hexvalue(*r++);
				if ((ul >> CHAR_BIT) != 0)
				{
					err = SM_CONF_ERR_CHAR_OVERFLOW;
					break;
				}
			}
			if (err == 0)
				*s++ = (unsigned char)ul;
			break;

		  case 'U':
			i = 8;
			/* FALLTHROUGH */

		  case 'u':
			/* \u1234, \U12345678 */
			while (i-- > 0)
			{
				/* check *before* reading: <e> is off limits */
				if (r >= e)
				{
					r = e;
					err = SM_CONF_ERR_EOF_IN_STRING;
					break;
				}
				if (!ISXDIGIT(*r))
				{
					err = SM_CONF_ERR_HEX_EXPECTED;
					r++;
					break;
				}
				ul = (ul << 4) | sm_conf_token_hexvalue(*r++);
			}
			if (err == 0)
				/* UTF-8 encode <ul> */
				err = sm_conf_token_ulong_to_utf8(ul, &s);
			break;

		  default:
			/* look at the character after the backslash again */
			r--;
			if (*r < '0' || *r > '7')
				*s++ = *r++;
			else
			{
				/* \oct [oct [oct]] */
				ul = *r++ - '0';
				if (r < e && *r >= '0' && *r <= '7')
				{
					ul = (ul << 3) | (*r++ - '0');
					if (r < e && *r >= '0' && *r <= '7')
						ul = (ul << 3) | (*r++ - '0');
				}
				if ((ul >> CHAR_BIT) > 0)
				{
					err = SM_CONF_ERR_CHAR_OVERFLOW;
					break;
				}
				*s++ = ul;
			}
			break;
		}
	}

	if (err == 0)
		*s_ptr = s;
	else
	{
		/*
		**  Copy the spelling of the problematic token.  Source
		**  and destination overlap when a buffer is decoded in
		**  place, hence memmove rather than memcpy.
		*/

		if (r > *r_ptr)
			sm_memmove(*s_ptr, *r_ptr, r - *r_ptr);
		*s_ptr += r - *r_ptr;
	}
	*r_ptr = r;
	return err;
}
/*
**  SM_CONF_TOKEN -- read a token or token group from the input stream
**
**	If nothing goes wrong, <tok> is filled with pointers to
**	the contents of the next token on the input stream.
**
**	On correct encoding, the return value and the tok->sct_type
**	match, and tok->sct_text and tok->sct_text_n delineate
**	the (decoded, if necessary) contents of the token.
**	The returned value data is not '\0'-terminated; instead,
**	length values are provided.
**
**	The input data buffer is overwritten with decoded tokens
**	as this happens.
**
**	Adjacent quoted strings are concatenated without a separator.
**	Unless SM_CONF_TOKEN_FLAG_IDENTIFIER is set, a string or atom
**	is furthermore merged with all strings/atoms that follow it,
**	separated by a single space wherever the input had white space
**	or a comment between them; such a merged token is an atom.
**
**	Parameters:
**		smc -- stream to read the token from (may be NULL)
**		tok -- token structure to fill
**		flags -- details about what token to read:
**			SM_CONF_TOKEN_FLAG_IDENTIFIER
**				-- don't combine atoms
**
**	Returns:
**		SM_CONF_TOKEN_ERROR on error (and tok->sct_error
**			contains the specific error code)
**		SM_CONF_TOKEN_EOF on EOF (and tok->sct_type is
**			also SM_CONF_TOKEN_EOF)
**		the type of the token otherwise.
*/

enum sm_conf_token_type_E
sm_conf_token(sm_conf_T *smc, sm_conf_token_T *tok, unsigned int flags)
{
	char const		*r;
	char			*e, *s;
	enum SM_CONF_ERROR_E	err;
	int			closed;	/* saw the closing quote of a string? */

	SM_IS_CONF_TOKEN(tok);
	if (smc == NULL)
		return tok->sct_type = SM_CONF_TOKEN_EOF;

	sm_conf_skip_space(smc);
	s = smc->smc_buf + smc->smc_buf_i;
	e = smc->smc_buf + smc->smc_buf_n;
	tok->sct_line = smc->smc_line;
	tok->sct_text = s;
	if (s >= e)
	{
		smc->smc_buf_i = s - smc->smc_buf;
		return tok->sct_type = SM_CONF_TOKEN_EOF;
	}

	switch (*s)
	{
	  case '"':
		tok->sct_type = SM_CONF_TOKEN_STRING;
		closed = 0;

		/*
		**  quote-delimited string; unquote on the fly:
		**  <r> reads the raw text, <s> writes the decoded text
		**  over the opening quote and what follows it.
		*/

		for (r = s + 1; r < e;)
		{
			char	*s0;

			if (IS_NEWLINE(r, e))
			{
				tok->sct_text_n = s - tok->sct_text;
				tok->sct_error = SM_CONF_ERR_NEWLINE_IN_STRING;
				r = NEWLINE_END(r, e);
				smc->smc_buf_i = r - smc->smc_buf;
				smc->smc_line++;
				return SM_CONF_TOKEN_ERROR;
			}
			else if (*r == '"')
			{
				r++;
				smc->smc_buf_i = r - smc->smc_buf;
				if (sm_conf_token_lookahead(smc)
				    == SM_CONF_TOKEN_STRING)
				{
					/* concatenate adjacent strings */
					sm_conf_skip_space(smc);
					r = smc->smc_buf + smc->smc_buf_i + 1;
					assert(r <= e);

					/* rescan first string char */
					continue;
				}
				else
				{
					/*
					**  Properly terminated.  Remember
					**  that, because <r> may equal <e>
					**  now if the closing quote was the
					**  last character of the input.
					*/

					closed = 1;
					break;
				}
			}

			if (*r != '\\')
			{
				*s++ = *r++;
				continue;
			}

			/* a backslash must be followed by something */
			if (r + 1 >= e)
			{
				smc->smc_buf_i = smc->smc_buf_n;
				tok->sct_text_n = s - tok->sct_text;
				tok->sct_error = SM_CONF_ERR_EOF_IN_STRING;
				return SM_CONF_TOKEN_ERROR;
			}

			r++;
			s0 = s;
			err = sm_conf_token_backslash(&s, &r, e);
			if (err != 0)
			{
				/* reinsert the \ */
				if (s > s0)
					sm_memmove(s0 + 1, s0, s - s0);
				*s0 = '\\';
				s++;

				smc->smc_buf_i = r - smc->smc_buf;
				tok->sct_text_n = s - tok->sct_text;
				tok->sct_error = err;
				return SM_CONF_TOKEN_ERROR;
			}
		}

		tok->sct_text_n = s - tok->sct_text;
		if (!closed)
		{
			/* ran out of input before the closing quote */
			smc->smc_buf_i = r - smc->smc_buf;
			tok->sct_error = SM_CONF_ERR_EOF_IN_STRING;
			return SM_CONF_TOKEN_ERROR;
		}
		smc->smc_buf_i = r - smc->smc_buf;
		break;

	  case '}':
	  case '{':
	  case ';':
	  case ',':
	  case '=':
		/* single-character token. */
		smc->smc_buf_i++;
		tok->sct_text_n = 1;
		return tok->sct_type = *tok->sct_text;

	  default:
		if (!IS_ATOM_CHAR(*s))
		{
			/* consume the offending character and report it */
			smc->smc_buf_i++;
			tok->sct_text_n = 1;
			tok->sct_type = SM_CONF_TOKEN_ATOM;
			tok->sct_error = SM_CONF_ERR_BAD_CHAR;
			return SM_CONF_TOKEN_ERROR;
		}

		for (s++; s < e && IS_ATOM_CHAR(*s); s++)
			;
		smc->smc_buf_i = s - smc->smc_buf;
		tok->sct_type = SM_CONF_TOKEN_ATOM;
		break;
	}

	/*
	**  If we arrive here, we read either a string or an atom.
	**  <smc->smc_buf_i> has been updated and points just past
	**  what we read.  <s> points just past the token contents so far.
	*/

	if ((flags & SM_CONF_TOKEN_FLAG_IDENTIFIER) == 0)
	{
		enum sm_conf_token_type_E	la;

		/*
		**  Since we're not parsing just an identifier,
		**  combine trailing text tokens (strings or atoms)
		**  with the one we just read.
		*/

		while (   (la = sm_conf_token_lookahead(smc))
			  == SM_CONF_TOKEN_STRING
		       || la == SM_CONF_TOKEN_ATOM)
		{
			sm_conf_token_T	next;
			int		have_space;

			/*
			**  If there is white space between here and our next
			**  token start, add a single space character.
			**
			**  Note that adjacent quoted strings are joined
			**  by the string parser and will *not* get space
			**  between them.
			*/

			have_space = smc->smc_buf_i < smc->smc_buf_n
				&& isascii(smc->smc_buf[smc->smc_buf_i])
				&& (  isspace(smc->smc_buf[smc->smc_buf_i])
				   || smc->smc_buf[smc->smc_buf_i] == '#');

			sm_memzero(&next, sizeof(next));
			next.sm_magic = SM_CONF_TOKEN_MAGIC;
			la = sm_conf_token(smc, &next, flags);
			if (   la != SM_CONF_TOKEN_ATOM
			    && la != SM_CONF_TOKEN_STRING)
			{
				/* hand the error of the follow-up token on */
				SM_ASSERT(la == SM_CONF_TOKEN_ERROR);
				SM_ASSERT(next.sct_error != 0);

				tok->sct_error = next.sct_error;
				tok->sct_text = next.sct_text;
				tok->sct_text_n = next.sct_text_n;
				return la;
			}

			/*
			**  Append the next token to this one, separated
			**  by white space as needed.
			*/

			if (have_space)
				*s++ = ' ';
			sm_memmove(s, next.sct_text, next.sct_text_n);
			s += next.sct_text_n;

			/*
			**  Even if this started out as a string,
			**  by now it's just a (mixed) atom.
			*/

			tok->sct_type = SM_CONF_TOKEN_ATOM;
		}
	}
	tok->sct_text_n = s - tok->sct_text;
	return tok->sct_type;
}
/*
**  SM_CONF_TOKEN_STRING_CHAR -- append the printable form of a character
**
**	Graphic characters and the space are copied as they are (a
**	double quote is preceded by a backslash); NUL, \a, \b, \f, \n,
**	\r, \t and \v are written as two-character escapes; everything
**	else is written as a backslash followed by three octal digits.
**
**	Parameters:
**		w -- write position; needs room for up to five bytes
**			(four characters and the '\0' that snprintf
**			appends after an octal escape).
**		ch -- character to render
**
**	Returns:
**		pointer just past the characters written.
*/

static char *
sm_conf_token_string_char(char *w, int ch)
{
	char	letter;

	if (ISGRAPH(ch) || ch == ' ')
	{
		/* printable: only the quote needs protection */
		if (ch == '"')
			*w++ = '\\';
		*w++ = ch;
		return w;
	}

	/* pick the letter of a two-character escape, if there is one */
	switch (ch)
	{
	  case '\0':
		letter = '0';
		break;

	  case '\a':
		letter = 'a';
		break;

	  case '\b':
		letter = 'b';
		break;

	  case '\f':
		letter = 'f';
		break;

	  case '\n':
		letter = 'n';
		break;

	  case '\r':
		letter = 'r';
		break;

	  case '\t':
		letter = 't';
		break;

	  case '\v':
		letter = 'v';
		break;

	  default:
		/* no short form: three-digit octal escape */
		snprintf(w, 5, "\\%3.3o", (unsigned char)ch);
		return w + 4;
	}

	*w++ = '\\';
	*w++ = letter;
	return w;
}
/*
**  SM_CONF_TOKEN_STRING -- render a token for use in messages
**
**	Parameters:
**		tok -- token to describe (may be NULL)
**		buf -- scratch buffer for the result
**		bufsize -- # of bytes available in <buf>
**
**	Returns:
**		a '\0'-terminated description of the token: either a
**		constant string or <buf>.  Strings are shown in double
**		quotes with unprintable characters escaped, atoms in
**		single quotes; a string that does not fit is cut off
**		and marked with "...".  If <buf> is too small to be
**		useful, a constant string is returned instead.
*/

char const *
sm_conf_token_string(
	sm_conf_token_T	*tok,
	char		*buf,
	size_t		bufsize)
{
	size_t	n;
	char	*w;

	if (tok == NULL)
		return "(null)";

	switch (tok->sct_type)
	{
	  case SM_CONF_TOKEN_ERROR:
		return "scanner error";

	  case SM_CONF_TOKEN_EOF:
		return "EOF";

	  case SM_CONF_TOKEN_NONE:
		return "internal error";

	  case SM_CONF_TOKEN_STRING:
		if (bufsize <= 8)
			return "string";

		n = 0;
		w = buf;
		*w++ = '"';

		/*
		**  Before each character make sure there is room for
		**  its widest rendering (4 bytes) *and* for the longest
		**  tail that may follow: "..." + '"' + '\0' (5 bytes).
		*/

		while ((buf + bufsize) - w >= 9 && n < tok->sct_text_n)
			w = sm_conf_token_string_char(w, tok->sct_text[n++]);

		if (n < tok->sct_text_n)
		{
			/* truncated */
			*w++ = '.';
			*w++ = '.';
			*w++ = '.';
		}
		*w++ = '"';
		*w = '\0';
		return buf;

	  case SM_CONF_TOKEN_ATOM:
		if (bufsize <= 4)
			return "atom";
		snprintf(buf, bufsize, "'%.*s'",
			(int)tok->sct_text_n,
			tok->sct_text);
		return buf;

	  case SM_CONF_TOKEN_SEMI:
		return "';'";

	  case SM_CONF_TOKEN_OBRACE:
		return "'{'";

	  case SM_CONF_TOKEN_CBRACE:
		return "'}'";

	  case SM_CONF_TOKEN_COMMA:
		return "','";

	  case SM_CONF_TOKEN_EQUAL:
		return "'='";

	  default:
		break;
	}

	/* without a buffer there is nowhere to put the number */
	if (buf == NULL || bufsize == 0)
		return "<unexpected token>";
	snprintf(buf, bufsize, "<unexpected token %d>", tok->sct_type);
	return buf;
}
/*
**  SM_CONF_TOKEN_MATCH_CHAR -- compare one user byte with one data byte
**
**	ASCII user bytes are compared without regard to case, all
**	other bytes must be identical.  The bytes are converted to
**	unsigned char before they are handed to the <ctype.h> functions.
**
**	Parameters:
**		u -- (decoded) byte from the user string
**		d -- byte from the data string
**
**	Returns:
**		1 if the bytes match, 0 if not.
*/

static int
sm_conf_token_match_char(char u, char d)
{
	if (isascii((unsigned char)u))
		return tolower((unsigned char)u) == tolower((unsigned char)d);
	return u == d;
}

/*
**  SM_CONF_TOKEN_MATCH -- match a tokenized string against data
**
**	Compare a user-specified string with a name as recorded in the data.
**	Evaluate \ in the user data without allocating a separate copy.
**	ASCII characters are compared case-insensitively.
**
**	Parameters:
**		smc -- handle of the environment in which all this happens
**			(not referenced by this function)
**		user -- user-specified string (interpreted like
**			tokens in a file)
**		user_n -- # of bytes pointed to by <user>
**		data -- internal string
**		data_n -- # of bytes pointed to by <data>
**
**	Returns:
**		1 if the two strings match (or both are NULL),
**		0 if they don't or on encoding error.
*/

int
sm_conf_token_match(
	sm_conf_T	*smc,
	char const	*user,
	size_t		user_n,
	char const	*data,
	size_t		data_n)
{
	char const	*user_e;
	char const	*data_e;
	char const	*r;
	char		buf[10], *buf_ptr;

	if (user == NULL && data == NULL)
		return 1;
	if (user == NULL || data == NULL)
		return 0;

	user_e = user + user_n;
	data_e = data + data_n;

	while (user < user_e)
	{
		if (*user == '\\')
		{
			/* a trailing backslash is an encoding error */
			if (user + 1 >= user_e)
				return 0;
			user++;
			buf_ptr = buf;

			if (*user == 'x' && user + 1 < user_e
			    && ISXDIGIT(user[1]))
			{
				unsigned long	ul = 0;

				/*
				**  \xNNNN... may have any number of digits.
				**  On overflow sm_conf_token_backslash()
				**  copies the whole spelling to its output,
				**  which <buf> cannot hold, so this escape
				**  is decoded right here.
				*/

				user++;
				while (user < user_e && ISXDIGIT(*user))
				{
					ul = (ul << 4)
					   | sm_conf_token_hexvalue(*user++);
					if ((ul >> CHAR_BIT) != 0)
						return 0;
				}
				*buf_ptr++ = (unsigned char)ul;
			}
			else if (sm_conf_token_backslash(&buf_ptr, &user,
							 user_e))
			{
				/* all other escapes are short enough for <buf> */
				return 0;
			}

			/* compare the decoded byte(s) with the data */
			for (r = buf; r < buf_ptr; r++)
			{
				if (data >= data_e)
					return 0;
				if (!sm_conf_token_match_char(*r, *data))
					return 0;
				data++;
			}
		}
		else
		{
			/* user string is longer than the data: no match */
			if (data >= data_e)
				return 0;
			if (!sm_conf_token_match_char(*user, *data))
				return 0;
			user++;
			data++;
		}
	}

	/* a match must use up all of the data */
	return data == data_e;
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */