/*
 * UTIL/CRCTEST.C	- CRC tester, by Matt Dillon
 * 
 * This program is designed to test an N-bit CRC against a word dictionary
 * which you pipe into it.   It is not required for normal diablo operation.
 *
 * Warning: This program will eat a lot of memory with large sets.  If the
 * set is known to be unique, you can use -u to reduce the memory footprint.
 *
 * cat unique-words | CRCTEST [-u] [-v] [-q] [-h#]
 *
 *	-h#	set final hash size, in bits 16-64, Default is 64 bits.
 *	-u	assume unique input, do not store string contents
 *	-v	verbose output, print collisions
 *	-q	quiet output, do not print the count every 100,000 tests
 *
 * The expected number of collisions is (NSAMP * (NSAMP-1) / 2) / 2^CRCBITS.
 *
 * This is calculated through statistics.  If you had 7 samples and an 8 bit
 * CRC (256 slots), the number of collisions is
 *
 *		sample #1	0/256
 *		sample #2	1/256
 *		sample #3	2/256
 *		sample #4	3/256
 *		sample #5	4/256
 *		sample #6	5/256
 *		sample #7	6/256
 *	+	sample #8	7/256
 *	------------------------------
 *		    [ 8*(8-1)/2 ] / 2^CRCBITS
 * 
 *  	NOTE!! this only works if 2^CRCBITS is substantially larger then NSAMP
 *	because we aren't taking into account the fact that a prior samples
 *	may collide and not increment the chance of collision for later 
 *	samples.
 *
 * So, for example, a 36 bit CRC with 1M samples should result in around 7
 * collisions.  A 42 bit CRC with 1M samples should result in around 0.1
 * collision.  A 42 bit CRC with 3 million samples should result in around 1 
 * collision.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

typedef unsigned int hint_t;	/* we want a 32 bit unsigned integer here */

typedef struct hash_t {
    hint_t	h1;
    hint_t	h2;
} hash_t;

typedef struct Hash {
    struct Hash *ha_Next;
    hash_t	ha_Hv;
} Hash;

#define TESTHSIZE	(4 * 1024 * 1024)
#define TESTHMASK	(TESTHSIZE - 1)

void inithash(void);
hash_t testhash(const char *p);

Hash	*HashAry[TESTHSIZE];
int	UniqueOpt;
int	HashLimit = 64;
int	VerboseOpt = 0;
int	QuietOpt = 0;
void	*rmalloc(int bytes);

int
main(int ac, char **av)
{
    char buf[256];
    int count = 0;
    int total = 0;
    int skip = 100000;
    int i;

    for (i = 1; i < ac; ++i) {
	char *p = av[i];

	if (*p == '-') {
	    p += 2;
	    switch(p[-1]) {
	    case 'u':
		UniqueOpt = 1;
		break;
	    case 'h':
		/*
		 * We can't go above 64 for obvious reasons.  We can't go
		 * below 16 due to the way I generate the polynomial.
		 */
		HashLimit = strtol(p, NULL, 0);
		if (HashLimit > 64 || HashLimit < 16) {
		    printf("valid values for -h between 16 & 64 inclusive\n");
		    exit(1);
		}
		break;
	    case 'q':
		QuietOpt = 1;
		break;
	    case 'v':
		VerboseOpt = 1;
		break;
	    default:
		fprintf(stderr, "Unknown option: %s\n", p - 2);
		exit(1);
	    }
	}
    }

    inithash();

    while (fgets(buf, sizeof(buf), stdin) != NULL) {
	int i;
	hash_t hv;
	Hash *h;
	char *s;

	for (s = strtok(buf, " ,\t\r\n"); s; s = strtok(NULL, " ,\t\r\n")) {
	    hv = testhash(s);
	    i = (hv.h1 ^ hv.h2) & TESTHMASK;
	    /* printf("%08x.%08x (%d) %s\n", hv.h1, hv.h2, i, s); */
	    for (h = HashAry[i]; h; h = h->ha_Next) {
		if (h->ha_Hv.h1 == hv.h1 && h->ha_Hv.h2 == hv.h2) {
		    if (UniqueOpt || strcmp(s, (char *)(h + 1)) != 0) {
			if (VerboseOpt) {
			    printf("Collision: %s\t%s\n", 
				s, 
				((UniqueOpt) ? "?" : (char *)(h + 1))
			    );
			}
			++count;
			++total;
		    }
		    break;
		}
	    }
	    if (h == NULL) {
		h = rmalloc(sizeof(Hash) + ((UniqueOpt) ? 0 : strlen(s) + 1));
		h->ha_Next = HashAry[i];
		h->ha_Hv = hv;
		if (UniqueOpt == 0)
		    strcpy((char *)(h + 1), s);
		HashAry[i] = h;
		++total;
	    }
	}
	if (total >= skip) {
	    if (QuietOpt == 0) {
		printf("Count %d/%d\n", count, total);
		fflush(stdout);
	    }
	    skip += 100000;
	}
    }
    printf("Count %d/%d\n", count, total);
    return(0);
}

/*
 * Poly: 0x00600340.00F0D50A
 *
 */

#define HINIT1	0xFAC432B1UL
#define HINIT2	0x0CD5E44AUL

#define POLY1	0x00600340UL
#define POLY2	0x00F0D50BUL

hash_t CrcXor[256];
hash_t Poly[64+1];

void
inithash(void)
{
    int i;

    /*
     * Polynomials to use for various crc sizes.  Start with the 64 bit
     * polynomial and shift it right to generate the polynomials for fewer
     * bits.  Note that the polynomial for N bits has no bit set above N-8.
     * This allows us to do a simple table-driven CRC.
     */

    Poly[64].h1 = POLY1;
    Poly[64].h2 = POLY2;
    for (i = 63; i >= 16; --i) {
	Poly[i].h1 = Poly[i+1].h1 >> 1;
	Poly[i].h2 = (Poly[i+1].h2 >> 1) | ((Poly[i+1].h1 & 1) << 31) | 1;
    }

    for (i = 0; i < 256; ++i) {
	int j;
	int v = i;
	hash_t hv = { 0, 0 };

	for (j = 0; j < 8; ++j, (v <<= 1)) {
	    hv.h1 <<= 1;
	    if (hv.h2 & 0x80000000UL)
		hv.h1 |= 1;
	    hv.h2 = (hv.h2 << 1);
	    if (v & 0x80) {
		hv.h1 ^= Poly[HashLimit].h1;
		hv.h2 ^= Poly[HashLimit].h2;
	    }
	}
	CrcXor[i] = hv;
    }
}

/*
 * testhash() - do the CRC.  The complexity is simply due to the programmable
 *		nature of the number of bits.   We extract the top 8 bits to
 *		use as a table lookup to obtain the polynomial XOR 8 bits at
 *		a time rather then 1 bit at a time.
 */

hash_t
testhash(const char *p)
{
    hash_t hv = { HINIT1, HINIT2 };

    if (HashLimit <= 32) {
	int s = HashLimit - 8;
	hint_t m = (hint_t)-1 >> (32 - HashLimit);

	hv.h1 = 0;
	hv.h2 &= m;

	while (*p) {
	    int i = (hv.h2 >> s) & 255;
	    /* printf("i = %d %08lx\n", i, CrcXor[i].h2); */
	    hv.h2 = ((hv.h2 << 8) & m) ^ *p ^ CrcXor[i].h2;
	    ++p;
	}
    } else if (HashLimit < 32+8) {
	int s2 = 32 + 8 - HashLimit;	/* bits in byte from h2 */
	hint_t m = (hint_t)-1 >> (64 - HashLimit);

	hv.h1 &= m;
	while (*p) {
	    int i = ((hv.h1 << s2) | (hv.h2 >> (32 - s2))) & 255;
	    hv.h1 = (((hv.h1 << 8) ^ (int)(hv.h2 >> 24)) & m) ^ CrcXor[i].h1;
	    hv.h2 = (hv.h2 << 8) ^ *p ^ CrcXor[i].h2;
	    ++p;
	}
    } else {
	int s = HashLimit - 40;
	hint_t m = (hint_t)-1 >> (64 - HashLimit);

	hv.h1 &= m;
	while (*p) {
	    int i = (hv.h1 >> s) & 255;
	    hv.h1 = ((hv.h1 << 8) & m) ^ (int)(hv.h2 >> 24) ^ CrcXor[i].h1;
	    hv.h2 = (hv.h2 << 8) ^ *p ^ CrcXor[i].h2;
	    ++p;
	}
    }
    /* printf("%08lx.%08lx\n", (long)hv.h1, (long)hv.h2); */
    return(hv);
}

void *
rmalloc(int bytes)
{
    static char *RBuf = NULL;
    static int	  RSize = 0;

    bytes = (bytes + 3) & ~3;

    if (bytes > RSize) {
	RBuf = malloc(65536);
	RSize = 65536;
    }
    RBuf += bytes;
    RSize -= bytes;
    return(RBuf - bytes);
}



syntax highlighted by Code2HTML, v. 0.9.1