/* $Id: tokenizer.c,v 1.17 2006/05/13 01:12:59 jonz Exp $ */ /* DSPAM COPYRIGHT (C) 2002-2006 JONATHAN A. ZDZIARSKI This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ /* * tokenizer.c - tokenizer functions * * DESCRIPTION * The tokenizer subroutines are responsible for decomposing a message into * its colloquial components. All components are stored collectively in * a diction object, passed into the function. * */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #ifdef HAVE_UNISTD_H #include #endif #include #include #ifdef TIME_WITH_SYS_TIME # include # include #else # ifdef HAVE_SYS_TIME_H # include # else # include # endif #endif #include "config.h" #include "tokenizer.h" #include "util.h" #include "libdspam.h" #include "language.h" #ifdef NCORE #include #include #include "ncore_adp.h" #endif #ifdef NCORE extern nc_dev_t g_ncDevice; extern NC_STREAM_CTX g_ncDelimiters; #endif /* * _ds_tokenize() - tokenize the message * * DESCRIPTION * tokenizes the supplied message * * INPUT ARGUMENTS * DSPAM_CTX *CTX pointer to context * char *header pointer to message header * char *body pointer to message body * ds_diction_t diction to store components * * RETURN VALUES * standard errors on failure * zero if successful * */ int _ds_tokenize (DSPAM_CTX * CTX, char *headers, char *body, ds_diction_t diction) { if (diction == NULL) return EINVAL; if (CTX->flags & DSF_SBPH) return _ds_tokenize_sbph(CTX, headers, body, diction); else return _ds_tokenize_ngram(CTX, headers, body, diction); } int _ds_tokenize_ngram( DSPAM_CTX *CTX, char *headers, char *body, ds_diction_t diction) { char *token; /* current token */ char *previous_token = NULL; /* used for bigrams (chained tokens) */ #ifdef NCORE nc_strtok_t NTX; #endif char *line = NULL; /* header broken up into lines */ char *ptrptr; char heading[128]; /* current heading */ int l; struct nt *header = NULL; struct nt_node *node_nt; struct nt_c c_nt; /* Tokenize URLs in message */ _ds_url_tokenize(diction, body, "http://"); _ds_url_tokenize(diction, body, "www."); _ds_url_tokenize(diction, body, "href="); /* * Header Tokenization */ header = nt_create (NT_CHAR); if (header == NULL) { LOG (LOG_CRIT, ERR_MEM_ALLOC); return EUNKNOWN; } line = strtok_r (headers, "\n", &ptrptr); while (line) { nt_add (header, line); line = strtok_r (NULL, "\n", &ptrptr); } node_nt = c_nt_first (header, &c_nt); heading[0] = 0; while (node_nt) { int multiline; #ifdef VERBOSE LOGDEBUG("processing line: %s", node_nt->ptr); #endif line = node_nt->ptr; token = strtok_r (line, ":", &ptrptr); if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " ")) { multiline = 0; strlcpy (heading, token, 128); previous_token = NULL; } else { multiline = 1; } #ifdef VERBOSE LOGDEBUG ("Reading '%s' header from: '%s'", heading, line); #endif if (CTX->flags & DSF_WHITELIST) { /* Use the entire From: line for auto-whitelisting */ if (!strcmp(heading, "From")) { char wl[256]; char *fromline = line + 5; unsigned long long whitelist_token; if (fromline[0] == 32) fromline++; snprintf(wl, sizeof(wl), "%s*%s", heading, fromline); whitelist_token = _ds_getcrc64(wl); ds_diction_touch(diction, whitelist_token, wl, 0); diction->whitelist_token = whitelist_token; } } /* Received headers use a different set of delimiters to preserve things like ip addresses */ token = strtok_r ((multiline) ? line : NULL, DELIMITERS_HEADING, &ptrptr); while (token) { l = strlen(token); if (l >= 1 && l < 50) { #ifdef VERBOSE LOGDEBUG ("Processing '%s' token in '%s' header", token, heading); #endif /* Process "current" token */ if (!_ds_process_header_token (CTX, token, previous_token, diction, heading) && (CTX->flags & DSF_CHAINED)) { previous_token = token; } } token = strtok_r (NULL, DELIMITERS_HEADING, &ptrptr); } previous_token = NULL; node_nt = c_nt_next (header, &c_nt); } nt_destroy (header); /* * Body Tokenization */ #ifdef VERBOSE LOGDEBUG("parsing message body"); #endif #ifdef NCORE token = strtok_n (body, &g_ncDelimiters, &NTX); #else token = strtok_r (body, DELIMITERS, &ptrptr); #endif while (token != NULL) { l = strlen (token); if (l >= 1 && l < 50) { /* Process "current" token */ if (!_ds_process_body_token (CTX, token, previous_token, diction) && (CTX->flags & DSF_CHAINED)) { previous_token = token; } } #ifdef NCORE token = strtok_n (NULL, NULL, &NTX); #else token = strtok_r (NULL, DELIMITERS, &ptrptr); #endif } /* Final token reassembly (anything left in the buffer) */ return 0; } int _ds_tokenize_sbph( DSPAM_CTX *CTX, char *headers, char *body, ds_diction_t diction) { int i; char *token; /* current token */ char *previous_tokens[SBPH_SIZE]; /* sbph chain */ #ifdef NCORE nc_strtok_t NTX; #endif char *line = NULL; /* header broken up into lines */ char *ptrptr; char heading[128]; /* current heading */ int l; struct nt *header = NULL; struct nt_node *node_nt; struct nt_c c_nt; for(i=0;iptr; token = strtok_r (line, ":", &ptrptr); if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " ")) { multiline = 0; strlcpy (heading, token, 128); _ds_sbph_clear(previous_tokens); } else { multiline = 1; } #ifdef VERBOSE LOGDEBUG ("Reading '%s' header from: '%s'", heading, line); #endif if (CTX->flags & DSF_WHITELIST) { /* Use the entire From: line for auto-whitelisting */ if (!strcmp(heading, "From")) { char wl[256]; char *fromline = line + 5; unsigned long long whitelist_token; if (fromline[0] == 32) fromline++; snprintf(wl, sizeof(wl), "%s*%s", heading, fromline); whitelist_token = _ds_getcrc64(wl); ds_diction_touch(diction, whitelist_token, wl, 0); diction->whitelist_token = whitelist_token; } } /* Received headers use a different set of delimiters to preserve things like ip addresses */ token = strtok_r ((multiline) ? line : NULL, DELIMITERS_HEADING, &ptrptr); while (token) { l = strlen(token); if (l > 0 && l < 50) { #ifdef VERBOSE LOGDEBUG ("Processing '%s' token in '%s' header", token, heading); #endif _ds_map_header_token (CTX, token, previous_tokens, diction, heading); } token = strtok_r (NULL, DELIMITERS_HEADING, &ptrptr); } for(i=0;i 0 && l < 50) { /* Process "current" token */ _ds_map_body_token (CTX, token, previous_tokens, diction); } #ifdef NCORE token = strtok_n (NULL, NULL, &NTX); #else token = strtok_r (NULL, DELIMITERS, &ptrptr); #endif } for(i=0;iconfig->attributes, "IgnoreHeader", heading)) return 0; if (!strncmp(heading, "X-DSPAM-", 8)) return 0; /* This is where we used to ignore certain headings */ if (heading[0] != 0) snprintf (combined_token, sizeof (combined_token), "%s*%s", heading, token); else strlcpy (combined_token, token, sizeof (combined_token)); tweaked_token = _ds_truncate_token(token); if (tweaked_token == NULL) return EUNKNOWN; snprintf(combined_token, sizeof(combined_token), "%s*%s", heading, tweaked_token); crc = _ds_getcrc64 (combined_token); #ifdef VERBOSE LOGDEBUG ("Token Hit: '%s'", combined_token); #endif ds_diction_touch(diction, crc, combined_token, 0); if ((CTX->flags & DSF_CHAINED) && previous_token != NULL) { char *tweaked_previous; tweaked_previous = _ds_truncate_token(previous_token); if (tweaked_previous == NULL) return EUNKNOWN; snprintf (combined_token, sizeof (combined_token), "%s*%s+%s", heading, tweaked_previous, tweaked_token); crc = _ds_getcrc64 (combined_token); ds_diction_touch(diction, crc, combined_token, DSD_CHAINED); free(tweaked_previous); } free(tweaked_token); return 0; } int _ds_process_body_token (DSPAM_CTX * CTX, char *token, const char *previous_token, ds_diction_t diction) { char combined_token[256]; unsigned long long crc; char *tweaked_token; tweaked_token = _ds_truncate_token(token); if (tweaked_token == NULL) return EUNKNOWN; crc = _ds_getcrc64 (tweaked_token); ds_diction_touch(diction, crc, tweaked_token, DSD_CONTEXT); if ((CTX->flags & DSF_CHAINED) && previous_token != NULL) { char *tweaked_previous = _ds_truncate_token(previous_token); if (tweaked_previous == NULL) return EUNKNOWN; snprintf (combined_token, sizeof (combined_token), "%s+%s", tweaked_previous, tweaked_token); crc = _ds_getcrc64 (combined_token); ds_diction_touch(diction, crc, combined_token, DSD_CHAINED | DSD_CONTEXT); free(tweaked_previous); } free(tweaked_token); return 0; } int _ds_map_header_token (DSPAM_CTX * CTX, char *token, char **previous_tokens, ds_diction_t diction, const char *heading) { int i, mask, t; unsigned long long crc; char key[256]; int active = 0, top; if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading)) return 0; if (!strncmp(heading, "X-DSPAM-", 8)) return 0; /* Shift all previous tokens up */ for(i=0;i2 && !strcmp((key+kl)-2, "+#")) { key[kl-2] = 0; kl -=2; } while(!strncmp(k, "#+", 2)) { top = 0; k+=2; } if (top) { snprintf(hkey, sizeof(hkey), "%s*%s", heading, k); crc = _ds_getcrc64(hkey); ds_diction_touch(diction, crc, hkey, DSD_CONTEXT); } } } return 0; } int _ds_map_body_token (DSPAM_CTX * CTX, char *token, char **previous_tokens, ds_diction_t diction) { int i, mask, t; int top; unsigned long long crc; char key[256]; int active = 0; /* Shift all previous tokens up */ for(i=0;i2 && !strcmp((key+kl)-2, "+#")) { key[kl-2] = 0; kl -=2; } while(!strncmp(k, "#+", 2)) { top = 0; k+=2; } if (top) { crc = _ds_getcrc64(k); ds_diction_touch(diction, crc, k, DSD_CONTEXT); } } } return 0; } /* * _ds_degenerate_message() * * DESCRIPTION * Degenerate the message into headers, body and tokenizable pieces * * This function is responsible for analyzing the actualized message and * degenerating it into only the components which are tokenizable. This * process effectively eliminates much HTML noise, special symbols, or * other non-tokenizable/non-desirable components. What is left is the * bulk of the message and only desired tags, URLs, and other data. * * INPUT ARGUMENTS * header pointer to buffer containing headers * body pointer to buffer containing message body */ int _ds_degenerate_message(DSPAM_CTX *CTX, buffer * header, buffer * body) { char *decode, *x, *y; struct nt_node *node_nt, *node_header; struct nt_c c_nt, c_nt2; int i = 0; char heading[1024]; if (! CTX->message) { LOG (LOG_WARNING, "_ds_actualize_message() failed: CTX->message is NULL"); return EUNKNOWN; } /* Iterate through each component and create large header/body buffers */ node_nt = c_nt_first (CTX->message->components, &c_nt); while (node_nt != NULL) { struct _ds_message_part *block = (struct _ds_message_part *) node_nt->ptr; #ifdef VERBOSE LOGDEBUG ("Processing component %d", i); #endif if (! block->headers || ! block->headers->items) { #ifdef VERBOSE LOGDEBUG (" : End of Message Identifier"); #endif } else { struct _ds_header_field *current_header; /* Accumulate the headers */ node_header = c_nt_first (block->headers, &c_nt2); while (node_header != NULL) { current_header = (struct _ds_header_field *) node_header->ptr; snprintf (heading, sizeof (heading), "%s: %s\n", current_header->heading, current_header->data); buffer_cat (header, heading); node_header = c_nt_next (block->headers, &c_nt2); } decode = block->body->data; if (block->media_type == MT_TEXT || block->media_type == MT_MESSAGE || block->media_type == MT_UNKNOWN || (block->media_type == MT_MULTIPART && !i)) { /* Accumulate the bodies, skip attachments */ if ( ( block->encoding == EN_BASE64 || block->encoding == EN_QUOTED_PRINTABLE) && ! block->original_signed_body) { if (block->content_disposition != PCD_ATTACHMENT) { LOGDEBUG ("decoding message block from encoding type %d", block->encoding); decode = _ds_decode_block (block); } } /* We found a tokenizable body component, add prefilters */ if (decode) { char *decode2 = strdup(decode); size_t len = strlen(decode2) + 1; /* -- PREFILTERS BEGIN -- */ /* Hexadecimal 8-Bit Encodings */ if (block->encoding == EN_8BIT) { char hex[5] = "0x00"; int conv; x = strchr(decode2, '%'); while(x) { if (isxdigit((unsigned char) x[1]) && isxdigit((unsigned char) x[2])) { hex[2] = x[1]; hex[3] = x[2]; conv = strtol(hex, NULL, 16); if (conv) { x[0] = conv; memmove(x+1, x+3, len-((x+3)-decode2)); len -= 2; } } x = strchr(x+1, '%'); } } /* HTML-Specific Filters */ if (block->media_subtype == MST_HTML) { /* Remove long HTML Comments */ x = strstr (decode2, ""); if (y) { memmove(x, y + 3, len-((y+3)-decode2)); //strlen (y + 3) + 1); len -= ((y+3) - x); x = strstr (x, "