/* * LIB/HASH.C * * (c)Copyright 1997, Matthew Dillon, All Rights Reserved. Refer to * the COPYRIGHT file in the base directory of this distribution * for specific rights granted. * * base32 encode is based on code that is * Copyright (c) 1995, 1996, 1997 Kungliga Tekniska Hgskolan * (Royal Institute of Technology, Stockholm, Sweden). * All rights reserved. * * CRC hash generator * * NOTE! If using this CRC code in other projects, please fix the folding * code (see 'folding' below). The '| 1' should be '^ 1'. It cannot be * changed in Diablo, unfortunately, without putting everying through the * wringer again which I do not want to do. */ #include "defs.h" Prototype int quickhash(const char *st); Prototype hash_t hhash(const char *msgid); Prototype void bhash_init(void); Prototype void bhash_update(const char *p, int len); Prototype void bhash_final(md5hash_t *h); Prototype md5hash_t *md5hash(const char *st); Prototype char *md5hashstr(md5hash_t *hash, char *buf); Prototype void bhash(hash_t *h, const char *p, int len); Prototype int GFIndex(char c); Prototype const char *GFHash(const char *group, GroupHashType *method); Prototype const char *GFName(const char *group, int ftype, int artBase, int wantdir, int iter, GroupHashType *method); void FindGroupDir(const char *group, char *path); Prototype int ExtractGroupHashInfo(const char *name, char *Hash, long *artBase, int *iter); Prototype char *GetHash(GroupHashType *Hash, char *hashStr); Prototype void SetCacheHash(struct CacheHash_t* ch, const char *group, int iter, GroupHashType *method); #define P1 7834891 #define P2 6017489 #define HINIT1 0xFAC432B1 #define HINIT2 0x0CD5E44A #define POLY1 0x00600340UL #define POLY2 0x00F0D50BUL #define OPOLY2 0x00F0D50AUL #ifdef TEST int DOpts.HashMethod = HASH_CRC; #endif hash_t CrcXor[3][256]; int DidCrcInit; struct diablo_MD5Context HMD5ctx; struct diablo_MD5Context BMD5ctx; /* article body MD5 context */ int quickhash(const char *st) { register int rval = 0; register unsigned char *p = (unsigned char *)st; while (*p) rval += *p++; return(rval); } void crcinit(int method) { int i; for (i = 0; i < 256; ++i) { int j; int v = i; hash_t hv = { 0, 0 }; for (j = 0; j < 8; ++j, (v <<= 1)) { if (v & 0x80) { hv.h1 ^= POLY1; hv.h2 ^= (method == HASH_CRC) ? POLY2 : OPOLY2; } hv.h2 = (hv.h2 << 1); if (hv.h1 & 0x80000000) hv.h2 |= 1; hv.h1 <<= 1; } CrcXor[method][i] = hv; } DidCrcInit = 1; } hash_t hhash(const char *msgid) { hash_t t; if (DOpts.HashMethod == HASH_CRC || DOpts.HashMethod == HASH_OCRC) { /* * HASH_CRC - CRC64 */ if (DidCrcInit == 0) { crcinit(HASH_CRC); crcinit(HASH_OCRC); } t.h1 = HINIT1; t.h2 = HINIT2; while (*msgid) { int i = (t.h1 >> 24) & 255; t.h1 = (t.h1 << 8) ^ (int)((uint32)t.h2 >> 24) ^ CrcXor[DOpts.HashMethod][i].h1; t.h2 = (t.h2 << 8) ^ (uint8)*msgid ^ CrcXor[DOpts.HashMethod][i].h2; ++msgid; } /* * Fold the generated CRC. Note, this is buggy but it is too late * for me to change it now. I should have XOR'd the 1 in, not OR'd * it when folding the bits. */ if (t.h1 & 0x80000000) t.h1 = (t.h1 & 0x7FFFFFFF) | 1; if (t.h2 & 0x80000000) t.h2 = (t.h2 & 0x7FFFFFFF) | 1; t.h1 |= 0x80000000; /* indicate CRC64 hash */ } else { /* * HASH_PRIME */ int h1 = 0x0034629D; int h2 = 0x007395DD; int hv = 0x1A3F5C4F; while (*msgid) { h1 = h1 * *(const unsigned char *)msgid % P1; h2 = h2 * *(const unsigned char *)msgid % P2; hv = (hv << 5) ^ *(const unsigned char *)msgid ^ (hv >> 23); ++msgid; } t.h1 = (h1 ^ (hv << 14)) & 0x7FFFFFFF; /* bit 31 reserved */ t.h2 = (h2 ^ (hv << 20)) & 0x7FFFFFFF; /* bit 31 reserved */ } return(t); } /* * Slightly different and faster hash algorithm to handle message bodies. * This is simpler. */ void bhash(hash_t *h, const char *p, int len) { while (len) { h->h1 += *(unsigned char *)p; /* simple checksum */ h->h2 = (h->h2 << 5) ^ h->h1 ^ (h->h2 >> 27); ++p; --len; } } void bhash_init(void) { diablo_MD5Init(&BMD5ctx); } void bhash_update(const char *p, int len) { diablo_MD5Update(&BMD5ctx, p, len); } void bhash_final(md5hash_t *h) { diablo_MD5Final((unsigned char *)h, &BMD5ctx); } /* * The digest has to be defined as md5hash_t to get memory alignments * right, otherwise we get bus errors. */ md5hash_t * md5hash(const char *st) { static md5hash_t digest; diablo_MD5Init(&HMD5ctx); diablo_MD5Update(&HMD5ctx, st, strlen(st)); diablo_MD5Final((unsigned char *)&digest, &HMD5ctx); return((md5hash_t *)&digest); } char * md5hashstr(md5hash_t *hash, char *buf) { int i; unsigned char *digest; static const char hex[] = "0123456789abcdef"; digest = (unsigned char *)hash; for (i = 0; i < 16; i++) { buf[i+i] = hex[digest[i] >> 4]; buf[i+i+1] = hex[digest[i] & 0x0f]; } buf[i+i] = '\0'; return(buf); } /* * Use a base32 algorithm to encode the MD5 of a newsgroup name into * a filename */ static char base32[] = "abcdefghijklmnopqrstuvwxyz012345" "abcdefghijklmnopqrstuvwxyz012345"; static char base64[] = "abcdefghijklmnopqrstuvwxyz0123456789" "ABCDEFGHIJKLMNOPQRSTUVWXYZ+-"; int base_encode(const void *data, int size, char **str, char *base) { static char tmpbuf[32]; char *p; char *s; int i; int c; unsigned char *q; p = s = tmpbuf; q = (unsigned char*)data; i=0; for(i = 0; i < size;){ c=q[i++]; c*=256; if(i < size) c+=q[i]; i++; c*=256; if(i < size) c+=q[i]; i++; p[0]=base[(c&0x00fc0000) >> 18]; p[1]=base[(c&0x0003f000) >> 12]; p[2]=base[(c&0x00000fc0) >> 6]; p[3]=base[(c&0x0000003f) >> 0]; if(i > size) p[3]='='; if(i > size+1) p[2]='='; p+=4; } *p=0; while (*--p == '=') *p = 0; *str = s; return strlen(s); } int GFIndex(char c) { return(index(base64, c) - base64); } /* * Newsgroup hash types and how the directories are created * * crc (default) * This is a simple hhash() of the newsgroup name printed as 2 * 8 byte hex values separated by a '.'. e.g: f5ca1300.3d9d520d * * md5-32[:B]/N[/N][/N] * md5-32[:B]\N[\N][\N] * This is a 32-bit conversion of a base64 type encoding of the * newsgroup name. The base64 character set is shown above. * * The 'B' value indicates the number of bytes of the hash (from the * beginning) to be used for the newsgroup filename * (defaults to all = 22). Lower values can be used to reduce the * filename length and possibly the directory size. * * The '/N' values sets the number of directories to create. Multiple * '/N' values can be specified to indicate multiple levels of that * number of directories. The directory name is made by taking the * a hhash() of the hash string, adding the 2 integers of the hash * and applying a % N to it (padding the result with leading '0's * to make it a consistent length. There is no easy way to calculate * the directory name other than to use 'dgrpctl listgrouphashfile'. * * The '\N' value represents the number of characters of the hash to * use for the directory name (max of 9). e.g: for \1\2 on a hash of * 'xhpgbjwwlrq2tl1pfq1epq', the directory structure would be: * * /news/spool/group/x/hp/ * * giving 32 directories at the first level and 32*32 = 1024 dirs * at the second level. This gives an easy way to calculate directories * but has a less even spread of groups per directory. * * A maximum of 9 directory levels can be created, although no more * than 3 is strictly necessary. * * md5-64[:B]/N[/N] * md5-64[:B]\N[\N] * This is a 64-bit base64 type encoding ('/' converted to a '-'). * * The 'B' and 'N' values are as the md-32 option above, with the * number of directories for a '\1' directory level set to 64 and * '\2' set to 4096. * * hierarchy * Use the newsgroup name for the filename and the directory is made * from the '.' separated components of the newsgroup name. This * option can lead to a very uneven spread of newsgroups per directory. * */ const char * GFHash(const char *group, GroupHashType *hashtype) { static char GFBuf[PATH_MAX]; switch (hashtype->gh_type) { case HASHGRP_CRC: { hash_t hv = hhash(group); snprintf(GFBuf, hashtype->gh_sigbytes, "%08x.%08x", (uint32)hv.h1, (uint32)hv.h2 ); break; } case HASHGRP_32MD5: case HASHGRP_64MD5: { char buf[34]; char *p = buf; unsigned char digest[16]; diablo_MD5Init(&HMD5ctx); diablo_MD5Update(&HMD5ctx, group, strlen(group)); diablo_MD5Final(digest, &HMD5ctx); base_encode(digest, 16, &p, (hashtype->gh_type == HASHGRP_64MD5) ? base64 : base32); snprintf(GFBuf, hashtype->gh_sigbytes, "%s", p); break; } case HASHGRP_HIER: { snprintf(GFBuf, hashtype->gh_sigbytes, "%s", group); } } return(GFBuf); } /* * Hash a string to an integer. Conflicts are not important as this * is only used to calculate a directory from a hash */ unsigned int hashNum(char *st) { hash_t h = hhash(st); return(abs(h.h1 + h.h2)); } const char * GFName(const char *group, int gftype, int artBase, int wantdir, int iter, GroupHashType *method) { static char GFBuf[PATH_MAX]; char ftype[16]; char itbuf[16]; char *hashstr = (char *)GFHash(group, method); switch (method->gh_type) { case HASHGRP_CRC: { hash_t hv = hhash(group); switch (gftype) { case GRPFTYPE_OVER: strcpy(ftype, "over"); break; case GRPFTYPE_DATA: strcpy(ftype, "data"); break; } if (iter > 0) snprintf(itbuf, sizeof(itbuf), ".%d", iter); else itbuf[0] = 0; if (wantdir) snprintf(GFBuf, sizeof(GFBuf), "%02x/%s.%d.%s%s", (uint8)hv.h1, ftype, artBase, hashstr, itbuf ); else snprintf(GFBuf, sizeof(GFBuf), "%s.%d.%s%s", ftype, artBase, hashstr, itbuf ); break; } case HASHGRP_32MD5: case HASHGRP_64MD5: { char dirbuf[PATH_MAX]; char tmpbuf[22]; int i; switch (gftype) { case GRPFTYPE_OVER: strcpy(ftype, "o"); break; case GRPFTYPE_DATA: strcpy(ftype, "d"); break; } switch (method->gh_type) { case HASHGRP_32MD5: strcat(ftype, ".32"); break; case HASHGRP_64MD5: strcat(ftype, ".64"); break; } dirbuf[0] = 0; for (i = 0; i < method->gh_dirlvl; i++) { switch (method->gh_dirtype) { case HASHGRP_DIR_NUM: { char format[16]; int formsize = 0; int c = method->gh_dirinfo[i]; while (c > 0) { formsize++; c = (c - 1) / 10; } sprintf(format,"%%0%dx", formsize); sprintf(tmpbuf, format, hashNum(&hashstr[i]) % method->gh_dirinfo[i]); if (i > 0) strcat(dirbuf, "/"); strcat(dirbuf, tmpbuf); break; } case HASHGRP_DIR_BIT: snprintf(tmpbuf, method->gh_dirinfo[i] + 1, "%s", hashstr); if (i > 0) strcat(dirbuf, "/"); strcat(dirbuf, tmpbuf); break; } } snprintf(itbuf, sizeof(itbuf), ".%d", iter); if (wantdir) snprintf(GFBuf, sizeof(GFBuf), "%s/%s%s.%s.%d", dirbuf, hashstr, itbuf, ftype, artBase ); else snprintf(GFBuf, sizeof(GFBuf), "%s%s.%s.%d", hashstr, itbuf, ftype, artBase ); break; } case HASHGRP_HIER: { char groupPath[PATH_MAX]; switch (gftype) { case GRPFTYPE_OVER: strcpy(ftype, "o"); break; case GRPFTYPE_DATA: strcpy(ftype, "d"); break; } if (wantdir) { FindGroupDir(group, groupPath); snprintf(GFBuf, sizeof(GFBuf), "%s%s.%d.%s", groupPath, ftype, artBase, hashstr ); } else { snprintf(GFBuf, sizeof(GFBuf), "%s.%d.%s", ftype, artBase, hashstr ); } break; } } if (DebugOpt > 4) printf("GFName:%s\n", GFBuf); return(GFBuf); } void SetCacheHash(struct CacheHash_t* ch, const char *group, int iter, GroupHashType *method) { ch->iter = iter; switch (method->gh_type) { case HASHGRP_CRC: { hash_t hv = hhash(group); ch->h1 = (uint64_t) hv.h1; ch->h2 = (uint64_t) hv.h2; break; } case HASHGRP_32MD5: case HASHGRP_64MD5: case HASHGRP_HIER: { /* we just hope there will be no collision */ md5hash_t *hv = md5hash(group); ch->h1 = hv->h1; ch->h2 = hv->h2; break; } } } void FindGroupDir(const char *group, char *path) { const char *p = group; const char *q = p; *path = 0; while ((p = strchr(p, '.')) != NULL) { if (p - q <= 1) { p++; continue; } strncat(path, q, p - q); strcat(path, "/"); p++; q = p; } } /* * Given a hashed newsgroup header filename, extract the components * and return hash type */ int ExtractGroupHashInfo(const char *name, char *Hash, long *artBase, int *iter) { int b; /* * This is pure evil because we have too many overview file * formats to support. We start with what is likely to be * the most common. It gets even messier because random values * can be assigned to variables, even when the whole string doesn't * match. */ /* * HASHGRP_32MD5 or HASHGRP_64MD5 */ if (sscanf(name, "%[^.].%d.d.%d.%ld", Hash, iter, &b, artBase) == 4) return(((b == 64) ? HASHGRP_64MD5 : HASHGRP_32MD5) | HASHGRPTYPE_DATA); *iter = 0; if (sscanf(name, "%[^.].%d.o.%d.%ld", Hash, iter, &b, artBase) == 4) return(((b == 64) ? HASHGRP_64MD5 : HASHGRP_32MD5) | HASHGRPTYPE_OVER); *iter = 0; /* * HASHGRP_CRC with iteration */ if (sscanf(name, "over.%ld.%17[0-9a-f.].%d", artBase, Hash, iter) == 3) return(HASHGRP_CRC | HASHGRPTYPE_OVER); *iter = 0; if (sscanf(name, "data.%ld.%17[0-9a-f.].%d", artBase, Hash, iter) == 3) return(HASHGRP_CRC | HASHGRPTYPE_DATA); *iter = 0; /* * HASHGRP_CRC */ if (sscanf(name, "over.%ld.%17[0-9a-f.]", artBase, Hash) == 2) return(HASHGRP_CRC | HASHGRPTYPE_OVER); *iter = 0; if (sscanf(name, "data.%ld.%17[0-9a-f.]", artBase, Hash) == 2) return(HASHGRP_CRC | HASHGRPTYPE_DATA); *iter = 0; /* * HASHGRP_HIER */ if (sscanf(name, "o.%ld.%[^/]", artBase, Hash) == 2) return(HASHGRP_HIER | HASHGRPTYPE_OVER); *iter = 0; if (sscanf(name, "d.%ld.%[^/]", artBase, Hash) == 2) return(HASHGRP_HIER | HASHGRPTYPE_DATA); *iter = 0; return(HASHGRP_NONE); } char * GetHash(GroupHashType *Hash, char *hashStr) { int i; char ch; char numSt[16]; switch (Hash->gh_type) { case HASHGRP_NONE: strcpy(hashStr, "NONE"); break; case HASHGRP_CRC: strcpy(hashStr, "crc"); break; case HASHGRP_32MD5: strcpy(hashStr, "md5-32"); break; case HASHGRP_64MD5: strcpy(hashStr, "md5-64"); break; case HASHGRP_HIER: strcpy(hashStr, "hierarchy"); break; default: strcpy(hashStr, "UNKNOWN\n"); } sprintf(numSt, ":%d", Hash->gh_sigbytes); strcat(hashStr, numSt); switch (Hash->gh_dirtype) { case HASHGRP_DIR_NUM: ch = '/'; break; case HASHGRP_DIR_BIT: ch = '\\'; break; default: ch = '?'; } for (i = 0; i < Hash->gh_dirlvl; i++) { sprintf(numSt, "%c%d", ch, Hash->gh_dirinfo[i]); strcat(hashStr, numSt); } return(hashStr); } #ifdef TEST int main(int ac, char **av) { int i; for (i = 1; i < ac; ++i) { hash_t hv = hhash(av[i]); printf("%08x.%08x\t%s\n", hv.h1, hv.h2, av[i]); } return(0); } #endif