/*
 * LIB/HASH.C
 *
 * (c)Copyright 1997, Matthew Dillon, All Rights Reserved.  Refer to
 *    the COPYRIGHT file in the base directory of this distribution 
 *    for specific rights granted.
 *
 * base32 encode is based on code that is
 * Copyright (c) 1995, 1996, 1997 Kungliga Tekniska H<F6>gskolan
 * (Royal Institute of Technology, Stockholm, Sweden).
 * All rights reserved.
 *
 * CRC hash generator
 *
 * NOTE!  If using this CRC code in other projects, please fix the folding
 * code (see 'folding' below).  The '| 1' should be '^ 1'.  It cannot be
 * changed in Diablo, unfortunately, without putting everying through the
 * wringer again which I do not want to do.
 */

#include "defs.h"

Prototype int quickhash(const char *st);
Prototype hash_t hhash(const char *msgid);
Prototype void bhash_init(void);
Prototype void bhash_update(const char *p, int len);
Prototype void bhash_final(md5hash_t *h);
Prototype md5hash_t *md5hash(const char *st);
Prototype char *md5hashstr(md5hash_t *hash, char *buf);
Prototype void bhash(hash_t *h, const char *p, int len);
Prototype int GFIndex(char c);
Prototype const char *GFHash(const char *group, GroupHashType *method);
Prototype const char *GFName(const char *group, int ftype, int artBase, int wantdir, int iter, GroupHashType *method);
void FindGroupDir(const char *group, char *path);
Prototype int ExtractGroupHashInfo(const char *name, char *Hash, long *artBase, int *iter);
Prototype char *GetHash(GroupHashType *Hash, char *hashStr);
Prototype void SetCacheHash(struct CacheHash_t* ch, const char *group, int iter, GroupHashType *method);

#define P1	7834891
#define P2	6017489

#define HINIT1  0xFAC432B1
#define HINIT2  0x0CD5E44A

#define POLY1   0x00600340UL
#define POLY2   0x00F0D50BUL
#define OPOLY2  0x00F0D50AUL

#ifdef TEST
int DOpts.HashMethod = HASH_CRC;
#endif

hash_t CrcXor[3][256];
int    DidCrcInit;

struct diablo_MD5Context HMD5ctx;
struct diablo_MD5Context BMD5ctx;	/* article body MD5 context */

int
quickhash(const char *st)
{
    register int rval = 0;
    register unsigned char *p = (unsigned char *)st;

    while (*p)
	rval += *p++;
    return(rval);
}

void
crcinit(int method)
{
    int i;

    for (i = 0; i < 256; ++i) {
        int j;
        int v = i;
        hash_t hv = { 0, 0 };

        for (j = 0; j < 8; ++j, (v <<= 1)) {
            if (v & 0x80) {
                hv.h1 ^= POLY1;
                hv.h2 ^= (method == HASH_CRC) ? POLY2 : OPOLY2;
            }
            hv.h2 = (hv.h2 << 1);
            if (hv.h1 & 0x80000000)
                hv.h2 |= 1;
            hv.h1 <<= 1;
        }
        CrcXor[method][i] = hv;
    }
    DidCrcInit = 1;
}

hash_t
hhash(const char *msgid)
{
    hash_t t;

    if (DOpts.HashMethod == HASH_CRC || DOpts.HashMethod == HASH_OCRC) {
	/*
	 * HASH_CRC  - CRC64
	 */
	if (DidCrcInit == 0) {
	    crcinit(HASH_CRC);
	    crcinit(HASH_OCRC);
	}
	t.h1 = HINIT1;
	t.h2 = HINIT2;
	
	while (*msgid) {
	    int i = (t.h1 >> 24) & 255;
	    t.h1 = (t.h1 << 8) ^ (int)((uint32)t.h2 >> 24) ^ CrcXor[DOpts.HashMethod][i].h1;
	    t.h2 = (t.h2 << 8) ^ (uint8)*msgid ^ CrcXor[DOpts.HashMethod][i].h2;
	    ++msgid;
	}
	/*
	 * Fold the generated CRC.  Note, this is buggy but it is too late
	 * for me to change it now.  I should have XOR'd the 1 in, not OR'd
	 * it when folding the bits.
	 */
	if (t.h1 & 0x80000000)
	    t.h1 = (t.h1 & 0x7FFFFFFF) | 1;
	if (t.h2 & 0x80000000)
	    t.h2 = (t.h2 & 0x7FFFFFFF) | 1;
	t.h1 |= 0x80000000;	/* indicate CRC64 hash */
    } else {
	/*
	 * HASH_PRIME
	 */
	int h1 = 0x0034629D;
	int h2 = 0x007395DD;
	int hv = 0x1A3F5C4F;

	while (*msgid) {
	    h1 = h1 * *(const unsigned char *)msgid % P1;
	    h2 = h2 * *(const unsigned char *)msgid % P2;
	    hv = (hv << 5) ^ *(const unsigned char *)msgid ^ (hv >> 23);
	    ++msgid;
	}
	t.h1 = (h1 ^ (hv << 14)) & 0x7FFFFFFF;	/* bit 31 reserved */
	t.h2 = (h2 ^ (hv << 20)) & 0x7FFFFFFF;	/* bit 31 reserved */
    }
    return(t);
}

/*
 * Slightly different and faster hash algorithm to handle message bodies.
 * This is simpler.
 */

void 
bhash(hash_t *h, const char *p, int len)
{
    while (len) {
	h->h1 += *(unsigned char *)p;	/* simple checksum */
	h->h2 = (h->h2 << 5) ^ h->h1 ^ (h->h2 >> 27);
	++p;
	--len;
    }
}

void 
bhash_init(void)
{
    diablo_MD5Init(&BMD5ctx);
}

void 
bhash_update(const char *p, int len)
{
    diablo_MD5Update(&BMD5ctx, p, len);
}

void 
bhash_final(md5hash_t *h)
{
    diablo_MD5Final((unsigned char *)h, &BMD5ctx);
}

/*
 * The digest has to be defined as md5hash_t to get memory alignments
 * right, otherwise we get bus errors.
 */
md5hash_t *
md5hash(const char *st)
{
    static md5hash_t digest;

    diablo_MD5Init(&HMD5ctx);
    diablo_MD5Update(&HMD5ctx, st, strlen(st));
    diablo_MD5Final((unsigned char *)&digest, &HMD5ctx);
    return((md5hash_t *)&digest);
}


char *
md5hashstr(md5hash_t *hash, char *buf)
{
    int i;
    unsigned char *digest;
    static const char hex[] = "0123456789abcdef";

    digest = (unsigned char *)hash;
    for (i = 0; i < 16; i++) {
        buf[i+i] = hex[digest[i] >> 4];
        buf[i+i+1] = hex[digest[i] & 0x0f];
    }
    buf[i+i] = '\0';
    return(buf);
}


/*
 * Use a base32 algorithm to encode the MD5 of a newsgroup name into
 * a filename
 */

static char base32[] = "abcdefghijklmnopqrstuvwxyz012345"
		       "abcdefghijklmnopqrstuvwxyz012345";
static char base64[] = "abcdefghijklmnopqrstuvwxyz0123456789"
		       "ABCDEFGHIJKLMNOPQRSTUVWXYZ+-";

int base_encode(const void *data, int size, char **str, char *base)
{
  static char tmpbuf[32];
  char *p;
  char *s;
  int i;
  int c;
  unsigned char *q;

  p = s = tmpbuf;
  q = (unsigned char*)data;
  i=0;
  for(i = 0; i < size;){
    c=q[i++];
    c*=256;
    if(i < size)
      c+=q[i];
    i++;
    c*=256;
    if(i < size)
      c+=q[i];
    i++;
    p[0]=base[(c&0x00fc0000) >> 18];
    p[1]=base[(c&0x0003f000) >> 12];
    p[2]=base[(c&0x00000fc0) >> 6];
    p[3]=base[(c&0x0000003f) >> 0];
    if(i > size)
      p[3]='=';
    if(i > size+1)
      p[2]='=';
    p+=4;
  }
  *p=0;
  while (*--p == '=')
	*p = 0;
  *str = s;
  return strlen(s);
}

int
GFIndex(char c)
{
    return(index(base64, c) - base64);
}

/*
 * Newsgroup hash types and how the directories are created
 *
 * crc           (default)
 *	This is a simple hhash() of the newsgroup name printed as 2
 *	8 byte hex values separated by a '.'. e.g: f5ca1300.3d9d520d
 *
 * md5-32[:B]/N[/N][/N]
 * md5-32[:B]\N[\N][\N]
 *	This is a 32-bit conversion of a base64 type encoding of the
 *	newsgroup name. The base64 character set is shown above.
 *
 *	The 'B' value indicates the number of bytes of the hash (from the
 *	beginning) to be used for the newsgroup filename
 *	(defaults to all = 22). Lower values can be used to reduce the
 *	filename length and possibly the directory size.
 *
 *	The '/N' values sets the number of directories to create. Multiple
 *	'/N' values can be specified to indicate multiple levels of that
 *	number of directories. The directory name is made by taking the
 *	a hhash() of the hash string, adding the 2 integers of the hash
 *	and applying a % N to it (padding the result with leading '0's
 *	to make it a consistent length. There is no easy way to calculate
 *	the directory name other than to use 'dgrpctl listgrouphashfile'.
 *	
 *	The '\N' value represents the number of characters of the hash to
 *	use for the directory name (max of 9). e.g: for \1\2 on a hash of
 *	'xhpgbjwwlrq2tl1pfq1epq', the directory structure would be:
 *
 *		/news/spool/group/x/hp/
 *
 *	giving 32 directories at the first level and 32*32 = 1024 dirs
 *	at the second level. This gives an easy way to calculate directories
 *	but has a less even spread of groups per directory.
 *
 *	A maximum of 9 directory levels can be created, although no more
 *	than 3 is strictly necessary.
 *
 * md5-64[:B]/N[/N]
 * md5-64[:B]\N[\N]
 *	This is a 64-bit base64 type encoding ('/' converted to a '-').
 *
 *	The 'B' and 'N' values are as the md-32 option above, with the
 *	number of directories for a '\1' directory level set to 64 and
 *	'\2' set to 4096.
 *
 * hierarchy
 *	Use the newsgroup name for the filename and the directory is made
 *	from the '.' separated components of the newsgroup name. This
 *	option can lead to a very uneven spread of newsgroups per directory.
 *
 */

const char *
GFHash(const char *group, GroupHashType *hashtype)
{
    static char GFBuf[PATH_MAX];

    switch (hashtype->gh_type) {
	case HASHGRP_CRC: {
	    hash_t hv = hhash(group);
	    snprintf(GFBuf, hashtype->gh_sigbytes, "%08x.%08x", (uint32)hv.h1, (uint32)hv.h2 );
	    break;
	}
	case HASHGRP_32MD5: case HASHGRP_64MD5: {
	    char buf[34];
	    char *p = buf;
	    unsigned char digest[16];
	    diablo_MD5Init(&HMD5ctx);
	    diablo_MD5Update(&HMD5ctx, group, strlen(group));
	    diablo_MD5Final(digest, &HMD5ctx);
	    base_encode(digest, 16, &p,
		(hashtype->gh_type == HASHGRP_64MD5) ?  base64 : base32);
	    snprintf(GFBuf, hashtype->gh_sigbytes, "%s", p);
	    break;
	}
	case HASHGRP_HIER: {
	    snprintf(GFBuf, hashtype->gh_sigbytes, "%s", group);
	}
    }

    return(GFBuf);
}

/*
 * Hash a string to an integer. Conflicts are not important as this
 * is only used to calculate a directory from a hash
 */
unsigned int
hashNum(char *st)
{
    hash_t h = hhash(st);
    return(abs(h.h1 + h.h2));
}

const char *
GFName(const char *group, int gftype, int artBase, int wantdir, int iter, GroupHashType *method)
{
    static char GFBuf[PATH_MAX];
    char ftype[16];
    char itbuf[16];
    char *hashstr = (char *)GFHash(group, method);


    switch (method->gh_type) {
	case HASHGRP_CRC: {
	    hash_t hv = hhash(group);
	    switch (gftype) {
		case GRPFTYPE_OVER:
		    strcpy(ftype, "over");
		    break;
		case GRPFTYPE_DATA:
		    strcpy(ftype, "data");
		    break;
	    }
	if (iter > 0)
	    snprintf(itbuf, sizeof(itbuf), ".%d", iter);
	else
	    itbuf[0] = 0;
	    if (wantdir)
		snprintf(GFBuf, sizeof(GFBuf), "%02x/%s.%d.%s%s",
						(uint8)hv.h1,
						ftype,
						artBase,
						hashstr,
						itbuf
		);
	    else
		snprintf(GFBuf, sizeof(GFBuf), "%s.%d.%s%s",
						ftype,
						artBase,
						hashstr,
						itbuf
		);
	    break;
	}
	case HASHGRP_32MD5: case HASHGRP_64MD5: {
	    char dirbuf[PATH_MAX];
	    char tmpbuf[22];
	    int i;
	    switch (gftype) {
		case GRPFTYPE_OVER:
		    strcpy(ftype, "o");
		    break;
		case GRPFTYPE_DATA:
		    strcpy(ftype, "d");
		    break;
	    }
	    switch (method->gh_type) {
		case HASHGRP_32MD5:
		    strcat(ftype, ".32");
		    break;
		case HASHGRP_64MD5:
		    strcat(ftype, ".64");
		    break;
	    }
	    dirbuf[0] = 0;
	    for (i = 0; i < method->gh_dirlvl; i++) {
		switch (method->gh_dirtype) {
		    case HASHGRP_DIR_NUM: {
			char format[16];
			int formsize = 0;
			int c = method->gh_dirinfo[i];
			while (c > 0) {
			    formsize++;
			    c = (c - 1) / 10;
			}
			sprintf(format,"%%0%dx", formsize);
			sprintf(tmpbuf, format,
				hashNum(&hashstr[i]) % method->gh_dirinfo[i]);
			if (i > 0)
			    strcat(dirbuf, "/");
			strcat(dirbuf, tmpbuf);
			break;
		    }
		    case HASHGRP_DIR_BIT:
			snprintf(tmpbuf, method->gh_dirinfo[i] + 1, "%s", hashstr);
			if (i > 0)
			    strcat(dirbuf, "/");
			strcat(dirbuf, tmpbuf);
			break;
		}
	    }
	    snprintf(itbuf, sizeof(itbuf), ".%d", iter);
	    if (wantdir)
		snprintf(GFBuf, sizeof(GFBuf), "%s/%s%s.%s.%d",
						dirbuf,
						hashstr,
						itbuf,
						ftype,
						artBase
		);
	    else
		snprintf(GFBuf, sizeof(GFBuf), "%s%s.%s.%d",
						hashstr,
						itbuf,
						ftype,
						artBase
		);
	    break;
	}
	case HASHGRP_HIER: {
	    char groupPath[PATH_MAX];
	    switch (gftype) {
		case GRPFTYPE_OVER:
		    strcpy(ftype, "o");
		    break;
		case GRPFTYPE_DATA:
		    strcpy(ftype, "d");
		    break;
	    }
	    if (wantdir) {
		FindGroupDir(group, groupPath);
		snprintf(GFBuf, sizeof(GFBuf), "%s%s.%d.%s",
						groupPath,
						ftype,
						artBase,
						hashstr
		);
	    } else {
		snprintf(GFBuf, sizeof(GFBuf), "%s.%d.%s",
						ftype,
						artBase,
						hashstr
		);
	    }
	    break;
	}
    }
    if (DebugOpt > 4)
	printf("GFName:%s\n", GFBuf);
    return(GFBuf);
}

void
SetCacheHash(struct CacheHash_t* ch, const char *group, int iter, GroupHashType *method)
{
    ch->iter = iter;
    switch (method->gh_type) {
	case HASHGRP_CRC: {
	    hash_t hv = hhash(group);
	    ch->h1 = (uint64_t) hv.h1;
	    ch->h2 = (uint64_t) hv.h2;
	    break;
	}
	case HASHGRP_32MD5:
	case HASHGRP_64MD5:
	case HASHGRP_HIER: { /* we just hope there will be no collision */
	    md5hash_t *hv = md5hash(group);
	    ch->h1 = hv->h1;
	    ch->h2 = hv->h2;
	    break;
	}
    }
}

void
FindGroupDir(const char *group, char *path)
{
    const char *p = group;
    const char *q = p;

    *path = 0;
    while ((p = strchr(p, '.')) != NULL) {
	if (p - q <= 1) {
	    p++;
	    continue;
	}
	strncat(path, q, p - q);
	strcat(path, "/");
	p++;
	q = p;
    }    
}

/*
 * Given a hashed newsgroup header filename, extract the components
 * and return hash type
 */
int
ExtractGroupHashInfo(const char *name, char *Hash, long *artBase, int *iter)
{
    int b;

    /*
     * This is pure evil because we have too many overview file
     * formats to support. We start with what is likely to be
     * the most common. It gets even messier because random values
     * can be assigned to variables, even when the whole string doesn't
     * match.
     */

    /*
     * HASHGRP_32MD5 or HASHGRP_64MD5
     */
    if (sscanf(name, "%[^.].%d.d.%d.%ld", Hash, iter, &b, artBase) == 4)
	return(((b == 64) ? HASHGRP_64MD5 : HASHGRP_32MD5) | HASHGRPTYPE_DATA);
    *iter = 0;
    if (sscanf(name, "%[^.].%d.o.%d.%ld", Hash, iter, &b, artBase) == 4)
	return(((b == 64) ? HASHGRP_64MD5 : HASHGRP_32MD5) | HASHGRPTYPE_OVER);
    *iter = 0;

    /*
     * HASHGRP_CRC with iteration
     */
    if (sscanf(name, "over.%ld.%17[0-9a-f.].%d", artBase, Hash, iter) == 3)
	return(HASHGRP_CRC | HASHGRPTYPE_OVER);
    *iter = 0;
    if (sscanf(name, "data.%ld.%17[0-9a-f.].%d", artBase, Hash, iter) == 3)
	return(HASHGRP_CRC | HASHGRPTYPE_DATA);
    *iter = 0;

    /*
     * HASHGRP_CRC
     */
    if (sscanf(name, "over.%ld.%17[0-9a-f.]", artBase, Hash) == 2)
	return(HASHGRP_CRC | HASHGRPTYPE_OVER);
    *iter = 0;
    if (sscanf(name, "data.%ld.%17[0-9a-f.]", artBase, Hash) == 2)
	return(HASHGRP_CRC | HASHGRPTYPE_DATA);
    *iter = 0;

    /*
     * HASHGRP_HIER
     */
    if (sscanf(name, "o.%ld.%[^/]", artBase, Hash) == 2)
	return(HASHGRP_HIER | HASHGRPTYPE_OVER);
    *iter = 0;
    if (sscanf(name, "d.%ld.%[^/]", artBase, Hash) == 2)
	return(HASHGRP_HIER | HASHGRPTYPE_DATA);
    *iter = 0;
    return(HASHGRP_NONE);
}

char *
GetHash(GroupHashType *Hash, char *hashStr)
{
    int i;
    char ch;
    char numSt[16];

    switch (Hash->gh_type) {
	case HASHGRP_NONE:
	    strcpy(hashStr, "NONE");
	    break;
	case HASHGRP_CRC:
	    strcpy(hashStr, "crc");
	    break;
	case HASHGRP_32MD5:
	    strcpy(hashStr, "md5-32");
	    break;
	case HASHGRP_64MD5:
	    strcpy(hashStr, "md5-64");
	    break;
	case HASHGRP_HIER:
	    strcpy(hashStr, "hierarchy");
	    break;
	default:
	    strcpy(hashStr, "UNKNOWN\n");
    }
    sprintf(numSt, ":%d", Hash->gh_sigbytes);
    strcat(hashStr, numSt);
    switch (Hash->gh_dirtype) {
	case HASHGRP_DIR_NUM:
	    ch = '/';
	    break;
	case HASHGRP_DIR_BIT:
	    ch = '\\';
	    break;
	default:
	    ch = '?';
    }
    for (i = 0; i < Hash->gh_dirlvl; i++) {
	sprintf(numSt, "%c%d", ch, Hash->gh_dirinfo[i]);
	strcat(hashStr, numSt);
    }
    return(hashStr);
}

#ifdef TEST

int
main(int ac, char **av)
{
    int i;

    for (i = 1; i < ac; ++i) {
	hash_t hv = hhash(av[i]);
	printf("%08x.%08x\t%s\n", hv.h1, hv.h2, av[i]);
    }
    return(0);
}

#endif



syntax highlighted by Code2HTML, v. 0.9.1