/*
 * LIB/SPAMFILTER.C	- Spam filtering.  
 *
 *	The spam filter is really simple.  The NNTP-Posting-Host: header
 *	is placed in a relatively large cache with a 32 bit 'hit' counter.
 *	The counter is incremented for each hit, and decremented once a
 *	minute.  If the counter goes above 4, the entry is locked.  If
 *	the counter goes above 16, the posting source is filtered by 
 *	adding the message-id to the history file prior to the article
 *	commit.  The cache entry is not unlocked until the counter 
 *	returns to 0.
 *
 *	NOTE: We cannot open FilterFd in InitSpamFilter() because this
 *	occurs prior to any server forks and will cause the fcntl locks
 *	to be shared across the forks.  This means that if a child 
 *	creates a lock and is then killed, the lock will NOT automatically
 *	be removed.  Thus, we open FilterFd on the first call to SpamFilter()
 *	rather than in InitSpamFilter().
 *
 * (c)Copyright 1998, Matthew Dillon, All Rights Reserved.  Refer to
 *    the COPYRIGHT file in the base directory of this distribution
 *    for specific rights granted.
 */

/*
 * TODO
 *
 */
#include "defs.h"

/*
 * Disable the SHM hash array for now. The mmap'ed version should
 * be fast enough and if we can get away from shared memory problems,
 * the world will be a better place.
 */
#undef	USE_SPAM_SHM

/*
 * Externally visible interface of the spam filter.  Every function and
 * variable declared here is defined further down in this file.
 *
 * NOTE(review): the `Prototype' marker is not defined in this file;
 * presumably it is provided through defs.h - confirm.
 */
Prototype void SetSpamFilterTrip(int body, int nph);
Prototype void SetSpamFilterOpt(void);
Prototype void InitSpamFilter(void);
Prototype void TermSpamFilter(void);
Prototype int SpamFilter(time_t t, SpamInfo *spamInfo, int *phow);
Prototype void ClearSpamFilterEntry(int which, int entry);
Prototype void DumpSpamFilterCache(FILE *fo, int raw);
Prototype int BodyFilterFd;
Prototype int NphFilterFd;

/*
 * Default table geometry and entry lifetime.  There is one family of
 * constants per cache (F_B_* and F_P_*); each mask is derived from the
 * size of its OWN family, so every *_HSIZE must be a power of 2.
 */
#define F_B_HSIZE	65536
#define F_B_HMASK	(F_B_HSIZE - 1)
#define F_B_EXPIRE	(60 * 60)	/* one hour expire */
#define F_P_HSIZE	65536
#define F_P_HMASK	(F_P_HSIZE - 1)
#define F_P_EXPIRE	(60 * 60)	/* one hour expire */

/*
 * Filter - one slot of a spam cache table.
 *
 *	The cache file is sized and mapped as Hsize * sizeof(Filter) (see
 *	openFilterFile()), so the layout of this structure is also the
 *	layout of the cache file: changing it changes the file format.
 */
typedef struct Filter {
    md5hash_t	f_Hash;		/* hash to check/store			*/
    hash_t	f_MHash;	/* message-id hash			*/
    int		f_Lines;	/* Line count				*/
    time_t	f_Time;		/* time demark: base for per-minute decay */
    int32	f_HitCount;	/* incremented per hit, decremented per min */
    int32	f_FilterCount;	/* filtered postings			*/
} Filter;

/*
 * FilterInfo - run-time state and tunables of one spam cache.
 *
 *	The BodyFilter and NphFilter globals are initialised positionally,
 *	so the order of the fields below must match those initialisers.
 */
typedef struct FilterInfo {
    Filter	*HashAry;	/* slot array (shm or mmap), NULL if not set up */
    int		Fd;		/* fd of the disk image of hash, -1 if closed */
    int		Trip;		/* spam after this many hits, 0 = filter off */
    int		Hsize;		/* number of entries in the memory hash	*/
    int		Hmask;		/* entry mask (Hsize - 1)		*/
    int		Expire;		/* how long (in seconds) entries last	*/
} FilterInfo;

FilterInfo	BodyFilter = { NULL, -1, 0, F_B_HSIZE, F_B_HMASK, F_P_EXPIRE };
FilterInfo	NphFilter = { NULL, -1, 0, F_B_HSIZE, F_B_HMASK, F_P_EXPIRE };
int		BodyFilterFd = -1;
int		NphFilterFd = -1;
int		FilterLock = 4;
int		FilterMax  = 100;

/*
 * Forward declarations of the helpers defined below that have no
 * Prototype line.
 */
void initSpamData(FilterInfo *f, const char *fname, int *fd);
void termSpamData(FilterInfo *filter);
int openFilterFile(FilterInfo *f, const char *fname);
int spamFilterTable(time_t t, FilterInfo *f, hash_t mhv, md5hash_t *hv, int lines);
void dumpSpamFilterMem(FILE *fo, FilterInfo *fi, char *stype, int raw);

/*
 * SetSpamFilterTrip() - set the trip counts of the two filters.
 *
 *	body	new trip count for the body filter
 *	nph	new trip count for the nph filter
 *
 *	A negative argument leaves the corresponding trip count untouched.
 */
void
SetSpamFilterTrip(int body, int nph)
{
    if (!(body < 0)) {
	BodyFilter.Trip = body;
    }
    if (!(nph < 0)) {
	NphFilter.Trip = nph;
    }
}

void
SetSpamFilterOpt(void)
{
    if (DOpts.SpamFilterOpt != NULL) {
	char *ptr = DOpts.SpamFilterOpt;
	char *oldopts = DOpts.SpamFilterOpt;
	int n;
	int enabled = 0;
	int nntpPostDisabled = 0;
	int ftype = 0;

	TermSpamFilter();
	DOpts.SpamFilterOpt = NULL;
	while (*ptr) {
	    switch (*ptr) {
		case ' ':
		    ++ptr;
		    ftype = 0;
		    break;
		case 'B':
		    ++ptr;
		    if (!isdigit((int)*ptr)) {
			logit(LOG_ERR, "Invalid spam option: %s\n", oldopts);
			free(oldopts);
			return;
		    }
		    n = strtol(ptr, NULL, 0);
		    SetSpamFilterTrip(n, -1);
		    while (isdigit((int)*ptr))
			++ptr;
		    if (n > 0)
			enabled = 1;
		    ftype = 1;
		    break;
		case 'D':
		    ++ptr;
		    SetSpamFilterTrip(-1, 0);
		    nntpPostDisabled = 1;
		    ftype = 0;
		    break;
		case 'e':
		    ++ptr;
		    if (!enabled || !isdigit((int)*ptr)) {
			logit(LOG_ERR, "Invalid spam option: %s\n", oldopts);
			free(oldopts);
			return;
		    }
		    n = strtol(ptr, NULL, 0);
		    switch (ftype) {
			case 1:
			    BodyFilter.Expire = n;
			    break;
			case 2:
			    NphFilter.Expire = n;
			    break;
			default:
			    logit(LOG_ERR, "Invalid spam option: %s\n", oldopts);
			free(oldopts);
			    return;
		    }
		    while (isdigit((int)*ptr))
			++ptr;
		    break;
		case 'N':
		    ++ptr;
		    if (!isdigit((int)*ptr)) {
			logit(LOG_ERR, "Invalid spam option: %s\n", oldopts);
			free(oldopts);
			return;
		    }
		    n = strtol(ptr, NULL, 0);
		    SetSpamFilterTrip(-1, n);
		    while (isdigit((int)*ptr))
			++ptr;
		    if (n > 0)
			enabled = 1;
		    ftype = 2;
		    break;
		case 's':
		    ++ptr;
		    if (!enabled || !isdigit((int)*ptr)) {
			logit(LOG_ERR, "Invalid spam option: %s\n", oldopts);
			free(oldopts);
			return;
		    }
		    n = strtol(ptr, NULL, 0);
		    if ((n ^ (n - 1)) != (n << 1) - 1) {
			logit(LOG_ERR, "spam hash size option (%d) not a power of 2\n",
							n);
			free(oldopts);
			return;
		    }
		    switch (ftype) {
			case 1:
			    if (BodyFilter.HashAry != NULL)
				break;
			    BodyFilter.Hsize = n;
			    BodyFilter.Hmask = n - 1;
			    break;
			case 2:
			    if (NphFilter.HashAry != NULL)
				break;
			    NphFilter.Hsize = n;
			    NphFilter.Hmask = n - 1;
			    break;
			default:
			    logit(LOG_ERR, "Invalid spam option: %s\n", oldopts);
			    free(oldopts);
			    return;
		    }
		    while (isdigit((int)*ptr))
			++ptr;
		    break;
		default:
		    logit(LOG_ERR, "Invalid spam option: %s\n", oldopts);
		    free(oldopts);
		    return;
	    }
	}
	if (enabled) {
	    DOpts.SpamFilterOpt = oldopts;
	} else {
	    free(oldopts);
	}
    }

}

/*
 * InitSpamFilter() - called by master diablo server to initialize the 
 *		      shared memory segment for the spam filter.
 *
 *		      We allocate, map, then remove the shared memory id
 *		      so it is not persistent after the last diablo process
 *		      goes away.  This is necessary because there is really
 *		      no way to reserve a permanent id without possibly 
 *		      stomping on someone else in the system using shared
 *		      memory.
 *
 *		      NOTE: we can open FilterFd here even though fork() will
 *		      share the lseek position because we do not use lseek
 *		      if USE_SPAM_SHM is set.  We also read any preexisting
 *		      spam cache into the shared memory segment.
 */

/*
 * openFilterFile() - open (creating if necessary) the cache file of a
 *		      filter and map it into memory.
 *
 *	fi	filter whose Fd and HashAry are filled in
 *	fname	cache file pattern, expanded with PatDbExpand()
 *
 *	If fi->HashAry is still NULL the file is checked against the
 *	expected size of Hsize * sizeof(Filter); a file of any other size
 *	is truncated and refilled with zeroed slots.  The file is then
 *	mapped shared, writable only when USE_SPAM_RW_MAP is non-zero.
 *
 *	Returns the descriptor (also left in fi->Fd), or -1 when the file
 *	could not be opened, initialised or mapped; fi->Fd is then -1 and
 *	fi->HashAry is NULL.
 */
int
openFilterFile(FilterInfo *fi, const char *fname)
{
    struct stat st;

    if ((fi->Fd = open(PatDbExpand(fname), O_RDWR|O_CREAT, 0644)) < 0) {
	int openErr = errno;	/* save before anything can clobber it */

	logit(LOG_ERR, "Unable to open spam cache file: %s (%s)\n",
					PatDbExpand(fname), strerror(openErr));
	return(fi->Fd);
    }

    if (fi->HashAry == NULL && fstat(fi->Fd, &st) == 0) {
	size_t imageSize = fi->Hsize * sizeof(Filter);
	int ok = 1;

	if (st.st_size != (off_t)imageSize) {
	    int i;
	    Filter f = {{ 0 }};

	    /*
	     * Rebuild the image.  Every step is checked: mapping a file
	     * that is shorter than the map would fault on first access.
	     */
	    if (ftruncate(fi->Fd, 0) < 0)
		ok = 0;
	    for (i = 0; ok && i < fi->Hsize; i++) {
		if (write(fi->Fd, &f, sizeof(f)) != (ssize_t)sizeof(f))
		    ok = 0;
	    }
	    if (ok && lseek(fi->Fd, 0, SEEK_SET) < 0)
		ok = 0;
	    if (!ok) {
		int initErr = errno;

		logit(LOG_ERR, "Unable to initialise spam cache file: %s (%s)\n",
					PatDbExpand(fname), strerror(initErr));
	    }
	}
	if (ok) {
	    fi->HashAry = xmap(
		NULL, 
		imageSize,
		PROT_READ | (USE_SPAM_RW_MAP * PROT_WRITE),
		MAP_SHARED,
		fi->Fd,
		0
	    );
	}
    }
    if (fi->HashAry == NULL) {
	/* nothing usable: do not keep a descriptor without a table */
	close(fi->Fd);
	fi->Fd = -1;
    }
    return(fi->Fd);
}

/*
 * initSpamData() - set up the slot table of one filter.
 *
 *	f	filter to initialise
 *	fname	cache file pattern, expanded with PatDbExpand()
 *	fd	in/out: caller's copy of the cache file descriptor
 *
 *	Does nothing if f->HashAry is already set.
 *
 *	USE_SPAM_SHM is #undef'd near the top of this file, so only the
 *	#else branch is compiled: the cache file is opened and mapped by
 *	openFilterFile(), and its result is stored in *fd.  The open is
 *	skipped whenever *fd is not -1, even if f->Fd itself is -1.
 *
 *	The USE_SPAM_SHM branch allocates a private SysV segment, attaches
 *	it, removes the id, zeroes the segment and then tries to load the
 *	existing cache file into it; it exit(1)s if the segment cannot be
 *	set up.
 */
void
initSpamData(FilterInfo *f, const char *fname, int *fd)
{
#if USE_SPAM_SHM
    int sid;
    struct shmid_ds ds;
#endif

    if (f->HashAry != NULL)
	return;
#if USE_SPAM_SHM
    sid = shmget(IPC_PRIVATE, f->Hsize * sizeof(Filter), SHM_R|SHM_W);

    if (sid < 0) {
        logit(LOG_ERR, "sysv shared memory alloc failed, is your machine configured with a high enough maximum segment size?");
        exit(1);
    } else if (DebugOpt > 1) {
	printf("Allocated spam shm segment\n");
    }

    /* attach first, then remove the id; the attach result is checked below */
    f->HashAry = (Filter *)shmat(sid, NULL, SHM_R|SHM_W);

    if (shmctl(sid, IPC_STAT, &ds) < 0 || shmctl(sid, IPC_RMID, &ds) < 0) {
        logit(LOG_ERR, "sysv shmctl stat/rmid failed");
        exit(1);
    }

    if (f->HashAry == (Filter *)-1) {
        f->HashAry = NULL;
        logit(LOG_ERR, "sysv shared memory map failed");
        exit(1);
    }
    bzero(f->HashAry, f->Hsize * sizeof(Filter));

    /* no O_CREAT here: a missing cache file takes the error branch */
    if ((f->Fd = open(PatDbExpand(fname), O_RDWR, 0644)) >= 0) {
	struct stat sb;
	/* a file of unexpected size is emptied instead of being loaded */
	if (fstat(f->Fd, &sb) == 0 && sb.st_size != f->Hsize * sizeof(Filter))
	    ftruncate(f->Fd, 0);
	else
	    read(f->Fd, f->HashAry, f->Hsize * sizeof(Filter));
	*fd = f->Fd;
    } else {
	logit(LOG_ERR, "Unable to open %s (%s)", PatDbExpand(fname),
							strerror(errno));
	termSpamData(f);
	*fd = -1;
    }
#else
    /* only open when the caller's descriptor is marked closed */
    if (*fd == -1)
	*fd = openFilterFile(f, fname);
#endif
}

void
InitSpamFilter(void)
{
    if (BodyFilter.Trip) {
	initSpamData(&BodyFilter, SpamBodyCachePat, &BodyFilterFd);
	logit(LOG_INFO, "Initialiased internal (body) spamfilter (trip=%d size=%d expire=%d)", 
			BodyFilter.Trip, BodyFilter.Hsize, BodyFilter.Expire);
    }
    if (NphFilter.Trip) {
	initSpamData(&NphFilter, SpamNphCachePat, &NphFilterFd);
	logit(LOG_INFO, "Initialiased internal (nph) spamfilter (trip=%d size=%d expire=%d)", 
			NphFilter.Trip, NphFilter.Hsize, NphFilter.Expire);
    }
}

/*
 * termSpamData() - release the slot table and cache file of one filter.
 *
 *	With USE_SPAM_SHM the table is first written back to the cache
 *	file (when both exist), then detached, and the file is closed.
 *	Without it, the mapping (if any) is removed and the file closed;
 *	nothing happens when the filter has no open descriptor.
 */
void
termSpamData(FilterInfo *fi)
{
#if USE_SPAM_SHM
    if (fi->HashAry != NULL) {
	if (fi->Fd >= 0) {
	    lseek(fi->Fd, 0L, 0);
	    write(fi->Fd, fi->HashAry, fi->Hsize * sizeof(Filter));
	    ftruncate(fi->Fd, fi->Hsize * sizeof(Filter));
	}
	if (shmdt((void *)fi->HashAry) != 0)
	    logit(LOG_ERR, "shmdt error: %s\n", strerror(errno));
	fi->HashAry = NULL;
    }
    if (fi->Fd >= 0) {
	close(fi->Fd);
	fi->Fd = -1;
    }
#else
    if (fi->Fd < 0)
	return;

    if (fi->HashAry != NULL)
	xunmap(fi->HashAry, fi->Hsize * sizeof(Filter));
    close(fi->Fd);
    fi->HashAry = NULL;
    fi->Fd = -1;
#endif
}

void
TermSpamFilter(void)
{
    int done = 0;

    if (BodyFilter.Fd != -1 || BodyFilter.HashAry != NULL) {
	termSpamData(&BodyFilter);
	done = 1;
    }
    if (NphFilter.Fd != -1 || NphFilter.HashAry != NULL) {
	termSpamData(&NphFilter);
	done = 1;
    }
    if (done)
	logit(LOG_INFO, "Terminated internal spamfilter");
}

/*
 * SpamFilter() - run an article through the enabled filters.
 *
 *	t		current time
 *	spamInfo	hashes and line count of the article; NULL means
 *			there is nothing to check
 *	phow		out: 0 = not filtered, 1 = tripped the nph filter,
 *			2 = tripped the body filter
 *
 *	The nph filter is consulted first, and only when the article has a
 *	PostingHost.  The body filter is consulted only if the nph filter
 *	did not trip.  A filter is skipped when its trip count is 0 or it
 *	has no slot table.
 *
 *	Returns 0 if the article passes, else the non-zero result of
 *	spamFilterTable() for the filter that tripped.
 */

int
SpamFilter(time_t t, SpamInfo *spamInfo, int *phow)
{
    int verdict;

    *phow = 0;

    if (spamInfo == NULL)
	return(0);

    if (NphFilter.Trip && NphFilter.HashAry != NULL &&
				spamInfo->PostingHost != NULL) {
	verdict = spamFilterTable(t, &NphFilter, spamInfo->MsgIdHash,
				&spamInfo->PostingHostHash, spamInfo->Lines);
	if (verdict != 0) {
	    *phow = 1;
	    return(verdict);
	}
    }

    if (BodyFilter.Trip && BodyFilter.HashAry != NULL) {
	verdict = spamFilterTable(t, &BodyFilter, spamInfo->MsgIdHash,
				&spamInfo->BodyHash,
				spamInfo->Lines);
	if (verdict != 0) {
	    *phow = 2;
	    return(verdict);
	}
    }
    return(0);
}

/*
 * spamFilterTable() - run one hash through one filter table.
 *
 *	t	current time
 *	fi	filter (slot table, descriptor, trip count, expire) to use
 *	mhv	message-id hash of the article
 *	hv	hash being rate-limited; selects and identifies the slot
 *	lines	article line count; 0 disables the line-count comparison
 *
 *	Returns 0 if the article is not filtered, otherwise a negative
 *	number: minus the slot's filtered-postings count, and never
 *	greater than -1.
 *
 *	The slot's byte range in the cache file is locked with hflock()
 *	for the duration of the call.  Updates are stored through the map
 *	when USE_SPAM_RW_MAP is non-zero, otherwise they are written to
 *	the cache file through fi->Fd.
 */

int
spamFilterTable(time_t t, FilterInfo *fi, hash_t mhv, md5hash_t *hv, int lines)
{
    int r = 0;
    int i = (hv->h1 ^ hv->h2) & fi->Hmask;	/* hash index */
    int off = i * sizeof(Filter);	/* map offset */
    Filter *f = &fi->HashAry[i];		/* structural pointer	*/
    time_t t0;
    int dhits = 0;
    int isdup = 1;

    hflock(fi->Fd, off, XLOCK_EX);

    t0 = f->f_Time;

    /*
     * calculate delta hits
     */
    {
	int32 dt = (int)(t - t0);

	if (t0 == 0 || dt < -10 || dt > fi->Expire) {
	    /*
	     * Slot is garbaged or long-expired, reset
	     * it.  Set dhits to force override.  Reset
	     * t0.
	     */
	    dhits = -f->f_HitCount;
	    t0 = t;
	    isdup = 0;
	} else {
	    /*
	     * Slot ok (but may or may not match hash code).
	     *
	     * calculate per-minute rate, adjust dhits as if hash
	     * code were ok.
	     */
	    while (dt >= 60 && f->f_HitCount + dhits > 0) {
		--dhits;
		t0 += 60;
		dt -= 60;
	    }
	    /*
	     * Check for duplicate message-id.  If not a duplicate,
	     * enable write-back and bump dhits.
	     */
	    if (f->f_MHash.h1 != mhv.h1 || f->f_MHash.h2 != mhv.h2) {
		++dhits;
		isdup = 0;
	    }
	}
    }

    /*
     * cache hit / miss
     */

    if (f->f_Hash.h1 == hv->h1 &&
	f->f_Hash.h2 == hv->h2 &&
	(lines == 0 || (f->f_Lines == lines))
    ) {
	/*
	 * same-slot, valid.
	 */
	Filter copy;

	copy = *f;

	copy.f_Time = t0;
	copy.f_HitCount += dhits;
	copy.f_MHash = mhv;

	if (copy.f_HitCount <= 0) {	/* handle garbage  */
	    copy.f_HitCount = 0;
	    copy.f_Time = t;
	}
	if (copy.f_HitCount >= FilterMax)	/* handle garbage  */
	    copy.f_HitCount = FilterMax;

	if (copy.f_HitCount >= fi->Trip) {
	    if (isdup == 0)
		++copy.f_FilterCount;
	    r = -copy.f_FilterCount;
	    if (r >= 0)		/* make sure r is negative */
		r = -1;
	}
	if (isdup == 0) {
#if USE_SPAM_RW_MAP
	    *f = copy;
#else
	    /* the slot range is already locked; write it via the file */
	    lseek(fi->Fd, off, 0);
	    write(fi->Fd, &copy, sizeof(Filter));
#endif
	}
    } else if (f->f_HitCount + dhits < FilterLock) {
	/*
	 * reset slot: it belongs to a different hash (or line count)
	 * but is not busy enough to be locked, so take it over.
	 */
	Filter copy = { { 0 } };

	copy.f_Hash = *hv;
	copy.f_MHash = mhv;
	copy.f_Time = t;
	copy.f_HitCount = 1;
	copy.f_FilterCount = 0;
	copy.f_Lines = lines;

#if USE_SPAM_RW_MAP
	*f = copy;
#else
	lseek(fi->Fd, off, 0);
	write(fi->Fd, &copy, sizeof(Filter));
#endif
    } else {
	/*
	 * Slot is held by another, still active hash: leave it alone.
	 */
	logit(LOG_INFO, "SpamFilter, slot %d in use: dt=%d, %d + %d\n",
	    i, 
	    (int)(t - f->f_Time),
	    f->f_HitCount, 
	    dhits
	);
    }
    hflock(fi->Fd, off, XLOCK_UN);

    return(r);
}

/*
 * ClearSpamFilterEntry() - zero one slot, or the whole table, of a filter.
 *
 *	which	1 = body filter, 2 = nph filter; anything else is ignored
 *	entry	slot index to clear; a negative value clears every slot.
 *		An index >= the table size is ignored.
 *
 *	Nothing is done when the selected filter has no slot table.
 *
 *	NOTE(review): the slots are cleared through the mapping, without
 *	hflock(), whatever the value of USE_SPAM_RW_MAP; openFilterFile()
 *	maps the table writable only when that macro is non-zero - confirm
 *	this path is only used with a writable map.
 */
void
ClearSpamFilterEntry(int which, int entry)
{
    FilterInfo *f;

    switch (which) {
	case 1:
		f = &BodyFilter;
		break;
	case 2:
		f = &NphFilter;
		break;
	default:
		return;
    }
    if (f->HashAry == NULL || entry >= f->Hsize)
	return;

    if (entry < 0)
	bzero(f->HashAry, f->Hsize * sizeof(Filter));
    else
	bzero(&f->HashAry[entry], sizeof(Filter));
}

/*
 * dumpSpamFilterMem() - print the in-use slots of one filter table.
 *
 *	fo	stream to print to
 *	fi	filter to dump; nothing is printed if it has no slot table
 *	stype	label for the filter, used in the headings
 *	raw	non-zero: also show expired slots and report slots whose
 *		decayed hit count is negative
 *
 *	Slots whose f_Time is 0 are always skipped.  The hit count shown
 *	is the stored count decayed by one per elapsed minute (not below
 *	0), while dtime is the undecayed age of the slot in seconds.
 *	The title and column headings are printed once, before the first
 *	reported slot.
 */
void
dumpSpamFilterMem(FILE *fo, FilterInfo *fi, char *stype, int raw)
{
    time_t t = time(NULL);
    char buf[64];

    if (fi->HashAry != NULL) {
	int i;
	int anyinfo = 0;

	for (i = 0; i < fi->Hsize; ++i) {
	    Filter *f = &fi->HashAry[i];
	    int32 dt = t - f->f_Time;
	    int32 odt = dt;
	    int hits = f->f_HitCount;

	    if (f->f_Time == 0)
		continue;
	    if (!raw && dt >= fi->Expire)
		continue;
	    while (hits > 0 && dt >= 60) {
		--hits;
		dt -= 60;
	    }
	    if (raw || hits >= 0) {
		if (!anyinfo) {
		    fprintf(fo, "Internal spamfilter %s hits\n", stype);
		    fprintf(fo, "%5s %16s %-15s %8s %-8s   %5s %5s %5s %5s\n",
				"entry",
				stype,
				"hash",
				"msgid",
				"hash",
				"lines",
				"dtime",
				"hits",
				"filtered"
		    );
		    anyinfo = 1;
		}
		fprintf(fo, "%05x %32s %08x.%08x   %5d %5d %5d %5d\n",
				i,
				md5hashstr(&f->f_Hash, buf),
				(int)f->f_MHash.h1,
				(int)f->f_MHash.h2,
				(int)f->f_Lines,
				(int)odt,
				(int)hits,
				(int)f->f_FilterCount
		);
	    }
	}
	if (!anyinfo)
	    fprintf(fo, "No spam %s hits\n", stype);
	fprintf(fo, "-------------------------------------------------\n");
    }
}

/*
 * DumpSpamFilterCache() - print both filter tables to a stream.
 *
 *	fo	stream to print to
 *	raw	passed through to dumpSpamFilterMem()
 *
 *	SpamFilter() is called first with a NULL article; its result is
 *	not used.  The body table is printed before the nph table.
 */
void
DumpSpamFilterCache(FILE *fo, int raw)
{
    int ignoredHow;
    time_t now = time(NULL);

    (void)SpamFilter(now, NULL, &ignoredHow);

    dumpSpamFilterMem(fo, &BodyFilter, "body", raw);
    dumpSpamFilterMem(fo, &NphFilter, "nph", raw);
}



/* syntax highlighted by Code2HTML, v. 0.9.1 */