/*
 * Copyright (c) 2004, 2006 Sendmail, Inc. and its suppliers.
 *	All rights reserved.
 *
 * By using this file, you agree to the terms and conditions set
 * forth in the LICENSE file which can be found at the top level of
 * the sendmail distribution.
 */

#include "sm/generic.h"

SM_RCSID("@(#)$Id: sm-conf-token.c,v 1.14 2006/01/09 19:06:25 ca Exp $")

/*
**  NOTE(review): the header names of the angle-bracket includes below
**  were missing from the copy of this file under review.  They have been
**  reconstructed from the standard names this file uses (assert(),
**  isascii()/isgraph()/isspace()/tolower(), CHAR_BIT, snprintf()) --
**  confirm against the pristine source.
*/

#if SM_LIBCONF_ALONE
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include "sm-conf.h"
#include "sm-util.h"
#else /* SM_LIBCONF_ALONE */
#include "sm/limits.h"
#include "sm/string.h"
#include "sm/assert.h"
#include "sm/error.h"
#include "sm/memops.h"
#include "sm/ctype.h"
#include <stdio.h>
#include "sm/sm-conf.h"
#endif /* SM_LIBCONF_ALONE */
#include "sm-conf-token.h"
#include "sm-conf-state.h"

/* SM-CONF-TOKEN.C -- tokenizer. */

/*
**  An atom character is any non-ASCII byte, or any printable ASCII
**  character other than the punctuation that forms a token by itself
**  or starts a string or comment.
*/

#define IS_ATOM_CHAR(r)			\
	(   !isascii(r)			\
	 || (   isgraph(r)		\
	     && (r) != '{'		\
	     && (r) != '}'		\
	     && (r) != ','		\
	     && (r) != ';'		\
	     && (r) != '='		\
	     && (r) != '#'		\
	     && (r) != '"'))

/* True if <x> lies before <e> and points to a CR or an LF. */
#define IS_NEWLINE(x, e)		\
	((x) < (e) && (*(x) == '\r' || *(x) == '\n'))

/*
**  Given <r> pointing to a CR or LF inside [.., e), yield a pointer just
**  past the line terminator.  CR LF and LF CR count as one terminator;
**  CR CR and LF LF count as two.  <r> must be less than <e>.
*/

#define NEWLINE_END(r, e)		\
	((r) + 1 + (   ((r) + 1 < (e))	\
		    && (r)[1] != (r)[0]	\
		    && ((r)[1] == '\n' || (r)[1] == '\r')))

/*
**  SM_CONF_TOKEN_LOOKAHEAD -- look ahead to the next token
**
**	Parameters:
**		smc -- a buffered configuration file.
**
**	Returns:
**		SM_CONF_TOKEN_EOF on running out of input,
**		SM_CONF_TOKEN_ERROR if the next character cannot start
**			a token,
**		otherwise the type of the next non-whitespace,
**			non-comment token.
**
**	Side Effects:
**		none.
*/

enum sm_conf_token_type_E
sm_conf_token_lookahead(sm_conf_T const *smc)
{
	char const	*e, *s;

	if (smc == NULL || smc->smc_buf_i >= smc->smc_buf_n)
		return SM_CONF_TOKEN_EOF;

	s = smc->smc_buf + smc->smc_buf_i;
	e = smc->smc_buf + smc->smc_buf_n;

	/* Skip white space and comments. */
	while (s < e)
	{
		if (*s == '#')
		{
			while (s < e && !IS_NEWLINE(s, e))
				s++;

			/*
			**  Only step over a line terminator that exists;
			**  a comment may run to the end of the buffer,
			**  and <s> must never move past <e>.
			*/

			if (s < e)
				s = NEWLINE_END(s, e);
		}
		else if (ISSPACE(*s))
			s++;
		else
			break;
	}
	if (s >= e)
		return SM_CONF_TOKEN_EOF;

	switch (*s)
	{
	  case '"':
		return SM_CONF_TOKEN_STRING;

	  case '}':
	  case '{':
	  case ';':
	  case ',':
	  case '=':
		/* single-character tokens use the character as type */
		return *s;

	  default:
		break;
	}

	if (!IS_ATOM_CHAR(*s))
		return SM_CONF_TOKEN_ERROR;
	return SM_CONF_TOKEN_ATOM;
}

/*
**  SM_CONF_SKIP_SPACE -- advance pointer to the beginning of the next token
**
**	Parameters:
**		smc -- a buffered configuration file.
**
**	Returns:
**		none
**
**	Side Effects:
**		advance buffer pointer and, if newlines are passed,
**		line number.
*/

static void
sm_conf_skip_space(sm_conf_T *smc)
{
	char	*e, *s;

	if (smc == NULL || smc->smc_buf_i >= smc->smc_buf_n)
		return;

	s = smc->smc_buf + smc->smc_buf_i;
	e = smc->smc_buf + smc->smc_buf_n;

	/* Skip white space and comments. */
	while (s < e)
	{
		if (*s == '#')
		{
			while (s < e && !IS_NEWLINE(s, e))
				s++;

			/*
			**  A comment that runs to the end of the buffer
			**  has no line terminator: do not count a line
			**  and do not advance past <e>.
			*/

			if (s < e)
			{
				smc->smc_line++;
				s = NEWLINE_END(s, e);
			}
		}
		else if (ISSPACE(*s))
		{
			if (IS_NEWLINE(s, e))
			{
				s = NEWLINE_END(s, e);
				smc->smc_line++;
			}
			else
				s++;
		}
		else
			break;
	}
	smc->smc_buf_i = s - smc->smc_buf;
}

/*
**  SM_CONF_TOKEN_HEXVALUE -- numeric value of a hexadecimal digit
**
**	Parameters:
**		c -- the character to convert
**
**	Returns:
**		0..15 for [0-9A-Fa-f], -1 for anything else.
*/

static int
sm_conf_token_hexvalue(unsigned char c)
{
	if (ISDIGIT(c))
		return c - '0';

	switch (c)
	{
	  case 'A': case 'a': return 0xA;
	  case 'B': case 'b': return 0xB;
	  case 'C': case 'c': return 0xC;
	  case 'D': case 'd': return 0xD;
	  case 'E': case 'e': return 0xE;
	  case 'F': case 'f': return 0xF;
	  default:
		break;
	}
	return -1;
}

/*
**  SM_CONF_TOKEN_ULONG_TO_UTF8 -- append the UTF-8 encoding of a value
**
**	Values up to 127 are written as a single byte.  Larger values are
**	written as a lead byte followed by one to five continuation
**	bytes (sequences of two to six bytes, covering values below 2^31).
**
**	Parameters:
**		ch -- value to encode
**		out -- in/out: write position; advanced past the bytes
**			written.  Up to six bytes are written.
**
**	Returns:
**		0 on success,
**		SM_CONF_ERR_CHAR_OVERFLOW if <ch> needs more than 31 bits
**			(nothing is written in that case).
*/

static enum SM_CONF_ERROR_E
sm_conf_token_ulong_to_utf8(unsigned long ch, char **out)
{
	unsigned int	i, nbits, shift;
	unsigned char	lead;

	if (ch <= 127)
	{
		*(*out)++ = (unsigned char)ch;
		return 0;
	}

	/*
	**  Find the sequence length <i>: a sequence of i bytes carries
	**  6 + 5 * (i - 1) payload bits (11, 16, 21, 26, 31).
	*/

	for (i = 2, nbits = 6 + 5; i <= 6; i++, nbits += 5)
		if (ch < (unsigned long)1 << nbits)
			break;
	if (i > 6)
		return SM_CONF_ERR_CHAR_OVERFLOW;

	/*
	**  Lead byte: the top <i> bits set, followed by the payload bits
	**  above the 6 * (i - 1) bits stored in the continuation bytes.
	*/

	shift = 6 * (i - 1);
	lead = (unsigned char)(0xFF << (8 - i));
	lead |= (unsigned char)(ch >> shift);
	*(*out)++ = lead;

	/*
	**  Continuation bytes, most significant group first.  The shift
	**  is lowered *before* each byte is produced so that the last
	**  byte holds the lowest six bits of <ch>.
	*/

	while (shift > 0)
	{
		shift -= 6;
		*(*out)++ = (unsigned char)(0x80 | (0x3F & (ch >> shift)));
	}
	return 0;
}

/*
**  SM_CONF_TOKEN_UNESCAPE -- decode one backslash escape
**
**	Worker for sm_conf_token_backslash().
**
**	Parameters:
**		s_ptr -- in/out: where to write decoded bytes
**		r_ptr -- in/out: read position, just past the backslash
**		e -- end of the readable input
**		keep_spelling -- on error, if non-zero, copy the raw text
**			that was consumed to *s_ptr and advance *s_ptr
**			past it; if zero, write nothing on error.
**
**	Returns:
**		0 on success, otherwise an error code.
**
**	Side Effects:
**		On success at most six bytes are written to *s_ptr.
**		With <keep_spelling> set, an error may write as many
**		bytes as were read, so the destination must then have
**		room for the whole remaining input.
*/

static enum SM_CONF_ERROR_E
sm_conf_token_unescape(
	char		**s_ptr,
	char const	**r_ptr,
	char const	*e,
	int		keep_spelling)
{
	unsigned long		ul;
	size_t			i;
	enum SM_CONF_ERROR_E	err;
	char			*s = *s_ptr;
	char const		*r = *r_ptr;

	err = 0;
	ul = 0;
	i = 4;

	if (r >= e)
		err = SM_CONF_ERR_EOF_IN_STRING;
	else if (IS_NEWLINE(r, e))
		err = SM_CONF_ERR_NEWLINE_IN_STRING;
	else
	{
		switch (*r++)
		{
		  case 'a': *s++ = '\a'; break;
		  case 'b': *s++ = '\b'; break;
		  case 'f': *s++ = '\f'; break;
		  case 'n': *s++ = '\n'; break;
		  case 'r': *s++ = '\r'; break;
		  case 't': *s++ = '\t'; break;
		  case 'v': *s++ = '\v'; break;

		  case 'x':
			/* \xNNNNNN... */
			if (r >= e)
			{
				*s_ptr = s;
				*r_ptr = r;
				return SM_CONF_ERR_EOF_IN_STRING;
			}
			if (!ISXDIGIT(*r))
			{
				r++;
				err = SM_CONF_ERR_HEX_EXPECTED;
				break;
			}
			while (r < e && ISXDIGIT(*r))
			{
				ul = (ul << 4) | sm_conf_token_hexvalue(*r++);
				if ((ul >> CHAR_BIT) != 0)
				{
					err = SM_CONF_ERR_CHAR_OVERFLOW;
					break;
				}
			}
			if (err == 0)
				*s++ = (unsigned char)ul;
			break;

		  case 'U':
			i = 8;
			/* FALLTHROUGH */

		  case 'u':
			/* \u1234, \U12345678 */
			while (i-- > 0)
			{
				/* <e> itself is not readable: test with >= */
				if (r >= e)
				{
					r = e;
					err = SM_CONF_ERR_EOF_IN_STRING;
					break;
				}
				if (!ISXDIGIT(*r))
				{
					err = SM_CONF_ERR_HEX_EXPECTED;
					r++;
					break;
				}
				ul = (ul << 4) | sm_conf_token_hexvalue(*r++);
			}
			if (err == 0)
				/* UTF-8 encode */
				err = sm_conf_token_ulong_to_utf8(ul, &s);
			break;

		  default:
			r--;
			if (*r < '0' || *r > '7')
			{
				/* unknown escape: the character itself */
				*s++ = *r++;
			}
			else
			{
				/* \oct [oct [oct]] */
				ul = *r++ - '0';
				if (r < e && *r >= '0' && *r <= '7')
				{
					ul = (ul << 3) | (*r++ - '0');
					if (r < e && *r >= '0' && *r <= '7')
						ul = (ul << 3) | (*r++ - '0');
				}
				if ((ul >> CHAR_BIT) > 0)
				{
					err = SM_CONF_ERR_CHAR_OVERFLOW;
					break;
				}
				*s++ = ul;
			}
			break;
		}
	}

	if (err == 0)
		*s_ptr = s;
	else if (keep_spelling)
	{
		/*
		**  Copy the spelling of the problematic token.  When
		**  decoding in place, source and destination lie in the
		**  same buffer and may overlap, hence memmove.
		*/

		if (r > *r_ptr)
			sm_memmove(*s_ptr, *r_ptr, r - *r_ptr);
		*s_ptr += r - *r_ptr;
	}
	*r_ptr = r;
	return err;
}

/*
**  SM_CONF_TOKEN_BACKSLASH -- decode one backslash escape
**
**	Understands \a \b \f \n \r \t \v, \x followed by hex digits
**	(value must fit in a char), \u with four and \U with eight hex
**	digits (written as UTF-8), and \ followed by one to three octal
**	digits.  Any other character after the backslash stands for
**	itself.
**
**	Parameters:
**		s_ptr -- in/out: where to write decoded bytes
**		r_ptr -- in/out: read position, just past the backslash
**		e -- end of the readable input
**
**	Returns:
**		0 on success, otherwise an error code.
**
**	Side Effects:
**		On error, the raw text that was consumed is copied to
**		*s_ptr (without the backslash) and *s_ptr is advanced
**		past it, so the output area must be able to hold as many
**		bytes as remain in the input.
*/

enum SM_CONF_ERROR_E
sm_conf_token_backslash(
	char		**s_ptr,
	char const	**r_ptr,
	char const	*e)
{
	return sm_conf_token_unescape(s_ptr, r_ptr, e, 1);
}

/*
**  SM_CONF_TOKEN -- read a token or token group from the input stream
**
**	If nothing goes wrong, <tok> is filled with pointers to
**	the contents of the next token on the input stream.
**
**	On correct encoding, the return value and the tok->sct_type
**	match, and tok->sct_text and tok->sct_text_n delineate
**	the (decoded, if necessary) contents of the token.
**	The returned value data is not '\0'-terminated; instead,
**	length values are provided.
**
**	The input data buffer is overwritten with decoded tokens
**	as this happens.
**
**	Parameters:
**		smc -- stream to read the token from
**		tok -- token structure to fill
**		flags -- details about what token to read:
**			SM_CONF_TOKEN_FLAG_IDENTIFIER
**				-- don't combine atoms
**
**	Returns:
**		SM_CONF_TOKEN_ERROR on error (and tok->sct_error
**			contains the specific error code)
**		SM_CONF_TOKEN_EOF on EOF (and tok->sct_type is
**			also SM_CONF_TOKEN_EOF)
**		a token type > 0 otherwise.
*/

enum sm_conf_token_type_E
sm_conf_token(sm_conf_T *smc, sm_conf_token_T *tok, unsigned int flags)
{
	char const		*r;
	char			*e, *s;
	enum SM_CONF_ERROR_E	err;
	int			closed;

	SM_IS_CONF_TOKEN(tok);

	if (smc == NULL)
		return tok->sct_type = SM_CONF_TOKEN_EOF;

	sm_conf_skip_space(smc);

	s = smc->smc_buf + smc->smc_buf_i;
	e = smc->smc_buf + smc->smc_buf_n;

	tok->sct_line = smc->smc_line;
	tok->sct_text = s;

	if (s >= e)
	{
		smc->smc_buf_i = s - smc->smc_buf;
		return tok->sct_type = SM_CONF_TOKEN_EOF;
	}

	switch (*s)
	{
	  case '"':
		tok->sct_type = SM_CONF_TOKEN_STRING;

		/*
		**  Quote-delimited string; unquote on the fly.
		**  <r> reads the raw text, <s> writes the decoded text
		**  into the same buffer and never overtakes <r>.
		**  <closed> records that the final closing quote was seen,
		**  as opposed to running out of input.
		*/

		closed = 0;
		for (r = s + 1; r < e;)
		{
			char	*s0;

			if (IS_NEWLINE(r, e))
			{
				tok->sct_text_n = s - tok->sct_text;
				tok->sct_error = SM_CONF_ERR_NEWLINE_IN_STRING;

				r = NEWLINE_END(r, e);
				smc->smc_buf_i = r - smc->smc_buf;
				smc->smc_line++;

				return SM_CONF_TOKEN_ERROR;
			}
			else if (*r == '"')
			{
				r++;
				smc->smc_buf_i = r - smc->smc_buf;
				if (sm_conf_token_lookahead(smc)
				    == SM_CONF_TOKEN_STRING)
				{
					/* concatenate adjacent strings */
					sm_conf_skip_space(smc);
					r = smc->smc_buf + smc->smc_buf_i + 1;
					assert(r <= e);

					/* rescan first string char */
					continue;
				}
				closed = 1;
				break;
			}

			if (*r != '\\')
			{
				*s++ = *r++;
				continue;
			}

			/* a backslash as the very last input byte */
			if (r + 1 >= e)
			{
				smc->smc_buf_i = smc->smc_buf_n;
				tok->sct_text_n = s - tok->sct_text;
				tok->sct_error = SM_CONF_ERR_EOF_IN_STRING;

				return SM_CONF_TOKEN_ERROR;
			}

			r++;
			s0 = s;
			err = sm_conf_token_backslash(&s, &r, e);
			if (err != 0)
			{
				/* reinsert the \ */
				if (s > s0)
					sm_memmove(s0 + 1, s0, s - s0);
				*s0 = '\\';
				s++;

				smc->smc_buf_i = r - smc->smc_buf;
				tok->sct_text_n = s - tok->sct_text;
				tok->sct_error = err;

				return SM_CONF_TOKEN_ERROR;
			}
		}

		tok->sct_text_n = s - tok->sct_text;
		smc->smc_buf_i = r - smc->smc_buf;

		/*
		**  Only a string that ran out of input is an error; a
		**  string whose closing quote is the last byte of the
		**  buffer is complete.
		*/

		if (!closed)
		{
			tok->sct_error = SM_CONF_ERR_EOF_IN_STRING;
			return SM_CONF_TOKEN_ERROR;
		}
		break;

	  case '}':
	  case '{':
	  case ';':
	  case ',':
	  case '=':
		/* single-character token. */
		smc->smc_buf_i++;
		tok->sct_text_n = 1;
		return tok->sct_type = *tok->sct_text;

	  default:
		if (!IS_ATOM_CHAR(*s))
		{
			smc->smc_buf_i++;
			tok->sct_text_n = 1;
			tok->sct_type = SM_CONF_TOKEN_ATOM;
			tok->sct_error = SM_CONF_ERR_BAD_CHAR;

			return SM_CONF_TOKEN_ERROR;
		}

		for (s++; s < e && IS_ATOM_CHAR(*s); s++)
			;
		smc->smc_buf_i = s - smc->smc_buf;
		tok->sct_type = SM_CONF_TOKEN_ATOM;
		break;
	}

	/*
	**  If we arrive here, we read either a string or an atom.
	**  <smc->smc_buf_i> has been updated and points just past
	**  what we read.  <s> points just past the token contents so far.
	*/

	if ((flags & SM_CONF_TOKEN_FLAG_IDENTIFIER) == 0)
	{
		enum sm_conf_token_type_E	la;

		/*
		**  Since we're not parsing just an identifier,
		**  combine trailing text tokens (strings or atoms)
		**  with the one we just read.
		*/

		while (   (la = sm_conf_token_lookahead(smc))
			  == SM_CONF_TOKEN_STRING
		       || la == SM_CONF_TOKEN_ATOM)
		{
			sm_conf_token_T	next;
			int		have_space;

			/*
			**  If there is white space between here and our next
			**  token start, add a single space character.
			**
			**  Note that adjacent quoted strings are joined
			**  by the string parser and will *not* get space
			**  between them.
			*/

			have_space = smc->smc_buf_i < smc->smc_buf_n
				&& isascii(smc->smc_buf[smc->smc_buf_i])
				&& (   isspace(smc->smc_buf[smc->smc_buf_i])
				    || smc->smc_buf[smc->smc_buf_i] == '#');

			sm_memzero(&next, sizeof(next));
			next.sm_magic = SM_CONF_TOKEN_MAGIC;
			la = sm_conf_token(smc, &next, flags);
			if (   la != SM_CONF_TOKEN_ATOM
			    && la != SM_CONF_TOKEN_STRING)
			{
				SM_ASSERT(la == SM_CONF_TOKEN_ERROR);
				SM_ASSERT(next.sct_error != 0);

				/* report the failing piece, not the group */
				tok->sct_error = next.sct_error;
				tok->sct_text = next.sct_text;
				tok->sct_text_n = next.sct_text_n;

				return la;
			}

			/*
			**  Append the next token to this one, separated
			**  by white space as needed.  The regions may
			**  overlap, hence memmove.
			*/

			if (have_space)
				*s++ = ' ';
			sm_memmove(s, next.sct_text, next.sct_text_n);
			s += next.sct_text_n;

			/*
			**  Even if this started out as a string,
			**  by now it's just a (mixed) atom.
			*/

			tok->sct_type = SM_CONF_TOKEN_ATOM;
		}
	}

	tok->sct_text_n = s - tok->sct_text;
	return tok->sct_type;
}

/*
**  SM_CONF_TOKEN_STRING_CHAR -- append a printable rendering of a character
**
**	Parameters:
**		w -- where to write
**		ch -- the character to render
**
**	Returns:
**		pointer just past the last byte of the rendering.
**
**	Side Effects:
**		The rendering is at most four bytes long, but up to five
**		bytes at <w> are written (the octal form is produced with
**		snprintf(), which adds a '\0').
*/

static char *
sm_conf_token_string_char(char *w, int ch)
{
	if (ISGRAPH(ch) || ch == ' ')
	{
		if (ch == '"')
			*w++ = '\\';
		*w++ = ch;
	}
	else
	{
		switch (ch)
		{
		  case '\0': *w++ = '\\'; *w++ = '0'; break;
		  case '\a': *w++ = '\\'; *w++ = 'a'; break;
		  case '\b': *w++ = '\\'; *w++ = 'b'; break;
		  case '\f': *w++ = '\\'; *w++ = 'f'; break;
		  case '\n': *w++ = '\\'; *w++ = 'n'; break;
		  case '\r': *w++ = '\\'; *w++ = 'r'; break;
		  case '\t': *w++ = '\\'; *w++ = 't'; break;
		  case '\v': *w++ = '\\'; *w++ = 'v'; break;
		  default:
			/* backslash and exactly three octal digits */
			snprintf(w, 5, "\\%3.3o",
				(unsigned int)(unsigned char)ch);
			w += 4;
			break;
		}
	}
	return w;
}

/*
**  SM_CONF_TOKEN_STRING -- render a token for use in messages
**
**	Parameters:
**		tok -- the token to describe (may be NULL)
**		buf -- scratch buffer for renderings that include the
**			token text
**		bufsize -- # of bytes pointed to by <buf>
**
**	Returns:
**		A '\0'-terminated description: either a constant string
**		or <buf>.  String tokens are quoted, escaped, and, if
**		they don't fit, cut short and marked with "...".
*/

char const *
sm_conf_token_string(
	sm_conf_token_T	*tok,
	char		*buf,
	size_t		bufsize)
{
	size_t	n;
	char	*w;

	if (tok == NULL)
		return "(null)";

	switch (tok->sct_type)
	{
	  case SM_CONF_TOKEN_ERROR:
		return "scanner error";

	  case SM_CONF_TOKEN_EOF:
		return "EOF";

	  case SM_CONF_TOKEN_NONE:
		return "internal error";

	  case SM_CONF_TOKEN_STRING:
		if (bufsize <= 8)
			return "string";

		n = 0;
		w = buf;
		*w++ = '"';

		/*
		**  Render a character only while 9 bytes are free:
		**  up to 4 for the character itself plus 5 for the
		**  worst-case tail "...", closing quote, and '\0'.
		*/

		while ((buf + bufsize) - w >= 9 && n < tok->sct_text_n)
			w = sm_conf_token_string_char(w, tok->sct_text[n++]);
		if (n < tok->sct_text_n)
		{
			*w++ = '.';
			*w++ = '.';
			*w++ = '.';
		}
		*w++ = '"';
		*w = '\0';
		return buf;

	  case SM_CONF_TOKEN_ATOM:
		if (bufsize <= 4)
			return "atom";
		snprintf(buf, bufsize, "'%.*s'",
			(int)tok->sct_text_n, tok->sct_text);
		return buf;

	  case SM_CONF_TOKEN_SEMI:
		return "';'";

	  case SM_CONF_TOKEN_OBRACE:
		return "'{'";

	  case SM_CONF_TOKEN_CBRACE:
		return "'}'";

	  case SM_CONF_TOKEN_COMMA:
		return "','";

	  case SM_CONF_TOKEN_EQUAL:
		return "'='";

	  default:
		break;
	}

	if (buf == NULL || bufsize == 0)
		return "unexpected token";

	/*
	**  NOTE(review): the format string was empty in the copy of this
	**  file under review, although a numeric argument was passed; the
	**  wording below is a reconstruction -- confirm against the
	**  pristine source.
	*/

	snprintf(buf, bufsize, "<unexpected token type %d>",
		(int)tok->sct_type);
	return buf;
}

/*
**  SM_CONF_TOKEN_CHAR_EQ -- compare two bytes, ignoring ASCII case
**
**	Parameters:
**		a -- byte from the user-specified string
**		b -- byte from the internal string
**
**	Returns:
**		non-zero if the bytes match, 0 otherwise.  Case is only
**		folded if <a> is an ASCII character.
*/

static int
sm_conf_token_char_eq(char a, char b)
{
	/* <ctype.h> functions require values in the unsigned char range */
	unsigned char	ua = (unsigned char)a;
	unsigned char	ub = (unsigned char)b;

	if (isascii(ua))
		return tolower(ua) == tolower(ub);
	return ua == ub;
}

/*
**  SM_CONF_TOKEN_MATCH -- match a tokenized string against data
**
**	Compare a user-specified string with a name as recorded in the data.
**	Evaluate \ in the user data without allocating a separate copy.
**
**	Parameters:
**		smc -- handle of the environment in which all this happens
**			(currently unused)
**		user -- user-specified string (interpreted like
**			tokens in a file)
**		user_n -- # of bytes pointed to by <user>
**		data -- internal string
**		data_n -- # of bytes pointed to by <data>
**
**	Returns:
**		1 if the two strings match, 0 if they don't or on
**		encoding error.
*/

int
sm_conf_token_match(
	sm_conf_T	*smc,
	char const	*user,
	size_t		user_n,
	char const	*data,
	size_t		data_n)
{
	char const	*user_e;
	char const	*data_e;
	char const	*r;
	char		buf[10], *buf_ptr;

	(void)smc;

	if (user == NULL && data == NULL)
		return 1;
	if (user == NULL || data == NULL)
		return 0;

	user_e = user + user_n;
	data_e = data + data_n;

	while (user < user_e)
	{
		if (*user == '\\')
		{
			/* a trailing backslash cannot be decoded */
			if (user + 1 >= user_e)
				return 0;

			/*
			**  Decode one escape into <buf>.  A successful
			**  decode yields at most six bytes.  The raw
			**  spelling of a bad escape can be arbitrarily
			**  long, so it must not be copied into <buf>.
			*/

			user++;
			buf_ptr = buf;
			if (sm_conf_token_unescape(&buf_ptr, &user,
						user_e, 0) != 0)
				return 0;

			for (r = buf; r < buf_ptr; r++)
			{
				if (data >= data_e)
					return 0;
				if (!sm_conf_token_char_eq(*r, *data))
					return 0;
				data++;
			}
		}
		else
		{
			/* <user> is longer than <data> */
			if (data >= data_e)
				return 0;
			if (!sm_conf_token_char_eq(*user, *data))
				return 0;
			user++;
			data++;
		}
	}
	return data == data_e;
}