/* ------------------------------------------------------------------------- A noncryptographic hash function producing up to 128 bit results. There are three interfaces: * hash(), which just hashes a byte array * keyhash(), which hashes a byte array plus a key * init(), update(), final(), which let a message be hashed in pieces The all produce the same values. This hash is pretty specific to little-endian machines with SSE instructions, that is, recent x86 compatible chips. For large messages it runs at about 3 bytes/cycle. This implements 3 separate hashes: one for inputs 0..15 bytes long, one for 16..LARGE, and one beyond that. The 16..LARGE uses 16-byte blocks and a 16-byte internal state. It xors the ith block to the state at steps 2i, 2(i+6-4(i%2))+1, and 2(i+10)+1. The macro CHURN() does two steps. One step adds the state to itself a left shifted, the other shuffles the state and xors it to itself right shifted. Once all uses of all blocks have been made, the 16-byte internal state is mixed until every output bit is affected. The 0..15 hash is just that final mixing of a 16-byte state. The LARGE+ hash is four independent copies of the 16..LARGE hash (operating on 16*4=64 byte chunks), with a different final mixing that mixes 4 16-byte states together until one 16-byte state has all output bits affected. It's basically a space-time tradeoff. By keeping a huge state so I can use each block three times at staggered intervals, I can get away with doing about two shifts per 16-byte block. My cryptographic hash function Maraca, and the way it was broken, were an exploration of that tradeoff. By Bob Jenkins, Jan 1 2009, public domain ------------------------------------------------------------------------- */ #include #include #include #include "zorba.h" #define LARGE 766 #define xor(a,b) _mm_xor_si128(a, b) #define add(a,b) _mm_add_epi64(a, b) #define shuf(a,b) _mm_shuffle_epi32(a, b) #define lshift(a,b) _mm_slli_epi64(a, b) #define rshift(a,b) _mm_srli_epi64(a, b) #define read(x) _mm_load_si128((void *)x) /* churn a state */ #define CHURN(s,first,second,third) \ { \ second=xor(first,second); \ s = xor(shuf(s,0x39),xor(rshift(s,5),first)); \ s = xor(add(lshift(s,8),s),third); \ } /* churn s, with second and third use of a block */ #define TAIL1(s,third) \ { \ s = xor(shuf(s,0x39), rshift(s,5)); \ s = add(s, xor(lshift(s,8), third)); \ } /* strength: I had to test this without the first or last line */ #define FINAL1(s) \ { \ s = xor(shuf(s,0x1b), rshift(s,1)); \ s = add(s, lshift(s,17)); \ s = xor(shuf(s,0x1b), rshift(s,13)); \ s = add(s, lshift(s,8)); \ s = xor(shuf(s,0x1b), rshift(s,2)); \ s = add(s, lshift(s,28)); \ s = xor(shuf(s,0x1b), rshift(s,16)); \ s = add(s, lshift(s,4)); \ s = xor(shuf(s,0x1b), rshift(s,6)); \ s = add(s, lshift(s,9)); \ } #define FINAL4(s) \ { \ s1 = xor(shuf(s1,0x1b), s2); \ s3 = xor(shuf(s3,0x1b), s0); \ s2 = add(shuf(s2,0x39), lshift(s1,6)); \ s0 = add(shuf(s0,0x39), lshift(s3,6)); \ s1 = xor(shuf(s1,0x1b), s0); \ s2 = xor(shuf(s2,0x1b), s3); \ s3 = add(shuf(s3,0x39), lshift(s2,19)); \ s0 = add(shuf(s0,0x39), lshift(s1,19)); \ s1 = xor(shuf(s1,0x1b), s3); \ s2 = xor(shuf(s2,0x1b), s0); \ s3 = add(shuf(s3,0x39), lshift(s1,9)); \ s0 = add(shuf(s0,0x39), lshift(s2,9)); \ s1 = xor(shuf(s1,0x1b), s2); \ s3 = xor(shuf(s3,0x1b), s0); \ s0 = add(shuf(s0,0x39), lshift(s3,5)); \ s1 = xor(shuf(s1,0x1b), s0); \ s0 = add(shuf(s0,0x39), s1); \ } /* churn four independent states */ #define CHURN4(data,i,q,s,x,y,z) \ { \ x##0 = read(&data[(i)+(q) ]); \ x##1 = read(&data[(i)+(q)+1]); \ x##2 = read(&data[(i)+(q)+2]); \ x##3 = read(&data[(i)+(q)+3]); \ CHURN(s##0,x##0,y##0,z##0); \ CHURN(s##1,x##1,y##1,z##1); \ CHURN(s##2,x##2,y##2,z##2); \ CHURN(s##3,x##3,y##3,z##3); \ } /* churn one independent state */ #define CHURN1(data,i,q,s,x,y,z) \ { \ x = read(&data[(i)+(q)]); \ CHURN(s,x,y,z); \ } #define TAIL4(s,third) { \ TAIL1(s##0,third##0); \ TAIL1(s##1,third##1); \ TAIL1(s##2,third##2); \ TAIL1(s##3,third##3); \ } #define TO_REG(a,m,i) { \ a##0 = m[i ].h; \ a##1 = m[i+1].h; \ a##2 = m[i+2].h; \ a##3 = m[i+3].h; \ } #define FROM_REG(a,m,i) { \ m[i ].h = a##0; \ m[i+1].h = a##1; \ m[i+2].h = a##2; \ m[i+3].h = a##3; \ } #define REG_REG(a,b) { \ a##0 = b##0; \ a##1 = b##1; \ a##2 = b##2; \ a##3 = b##3; \ } /* initialize a zorba state */ void init(zorba *z) { memset(z->s, 0x55, sizeof(z->s)); memset(z->accum, 0, sizeof(z->accum)); z->datalen = 0; z->messagelen = (u8)0; } /* hash a piece of a message */ void update(zorba *z, const void *data, size_t len) { int counter; int oldlen = z->datalen; size_t total = oldlen + len; __m128i s0,a0,b0,c0,d0,e0,f0,g0,h0,i0,j0,k0,l0; __m128i s1,a1,b1,c1,d1,e1,f1,g1,h1,i1,j1,k1,l1; __m128i s2,a2,b2,c2,d2,e2,f2,g2,h2,i2,j2,k2,l2; __m128i s3,a3,b3,c3,d3,e3,f3,g3,h3,i3,j3,k3,l3; /* exit early if we don't have a complete block */ z->messagelen += len; if (total < BUFFERED) { memcpy(((char *)z->data)+oldlen, data, len); z->datalen = total; return; } /* load the registers */ TO_REG(s,z->s,0); TO_REG(l,z->accum,0); TO_REG(k,z->accum,4); TO_REG(j,z->accum,8); TO_REG(i,z->accum,12); TO_REG(h,z->accum,16); TO_REG(g,z->accum,20); TO_REG(f,z->accum,24); TO_REG(e,z->accum,28); TO_REG(d,z->accum,32); TO_REG(c,z->accum,36); /* use the first cached block, if any */ if (oldlen) { __m128i *cache = (__m128i *)z->data; int piece = BUFFERED-oldlen; memcpy(((char *)cache)+oldlen, data, piece); CHURN4(cache,0, 0,s,b,h,l); CHURN4(cache,0, 4,s,a,c,k); CHURN4(cache,0, 8,s,l,f,j); CHURN4(cache,0,12,s,k,a,i); CHURN4(cache,0,16,s,j,d,h); CHURN4(cache,0,20,s,i,k,g); CHURN4(cache,0,24,s,h,b,f); CHURN4(cache,0,28,s,g,i,e); CHURN4(cache,0,32,s,f,l,d); CHURN4(cache,0,36,s,e,g,c); CHURN4(cache,0,40,s,d,j,b); CHURN4(cache,0,44,s,c,e,a); len -= piece; } /* use any other complete blocks */ if ((((size_t)data) & 15) == 0) { const __m128i *aligned_data = (const __m128i *)data; int len2 = len/16; for (counter=BLOCK; counter<=len2; counter+=BLOCK) { CHURN4(aligned_data,counter,-48,s,b,h,l); CHURN4(aligned_data,counter,-44,s,a,i,k); CHURN4(aligned_data,counter,-40,s,l,f,j); CHURN4(aligned_data,counter,-36,s,k,g,i); CHURN4(aligned_data,counter,-32,s,j,d,h); CHURN4(aligned_data,counter,-28,s,i,e,g); CHURN4(aligned_data,counter,-24,s,h,b,f); CHURN4(aligned_data,counter,-20,s,g,c,e); CHURN4(aligned_data,counter,-16,s,f,l,d); CHURN4(aligned_data,counter,-12,s,e,a,c); CHURN4(aligned_data,counter, -8,s,d,j,b); CHURN4(aligned_data,counter, -4,s,c,k,a); } counter *= 16; } else { for (counter=BUFFERED; counter<=len; counter+=BUFFERED) { __m128i *cache = (__m128i *)z->data; memcpy(cache, ((const char *)data)+(counter-BUFFERED), BUFFERED); CHURN4(cache,0, 0,s,b,h,l); CHURN4(cache,0, 4,s,a,i,k); CHURN4(cache,0, 8,s,l,f,j); CHURN4(cache,0,12,s,k,g,i); CHURN4(cache,0,16,s,j,d,h); CHURN4(cache,0,20,s,i,e,g); CHURN4(cache,0,24,s,h,b,f); CHURN4(cache,0,28,s,g,c,e); CHURN4(cache,0,32,s,f,l,d); CHURN4(cache,0,36,s,e,a,c); CHURN4(cache,0,40,s,d,j,b); CHURN4(cache,0,44,s,c,k,a); } } /* store the registers */ FROM_REG(s,z->s,0); FROM_REG(l,z->accum,0); FROM_REG(k,z->accum,4); FROM_REG(j,z->accum,8); FROM_REG(i,z->accum,12); FROM_REG(h,z->accum,16); FROM_REG(g,z->accum,20); FROM_REG(f,z->accum,24); FROM_REG(e,z->accum,28); FROM_REG(d,z->accum,32); FROM_REG(c,z->accum,36); /* cache the last partial block, if any */ len = BUFFERED + len - counter; if (len) { memcpy(z->data, ((const char *)data)+counter-BUFFERED, len); z->datalen = len; } } /* midhash assumes the message is aligned to a 16-byte boundary */ z128 midhash( const void *message, size_t mlen, const void *key, size_t klen) { __m128i s,a,b,c,d,e,f,g,h,i,j,k,l; __m128i *cache = (__m128i *)message; z128 val; int total = mlen + klen; int counter; if (klen > 16) { printf("key length must be 16 bytes or less\n"); exit(1); } /* add the key and length; pad to a multiple of 16 bytes */ memcpy(((char *)cache)+mlen, key, klen); ((char *)cache)[total] = klen; ((char *)cache)[total+1] = mlen+1; total += 2; if (total&15) { int tail = 16 - (total&15); memset(((char *)cache)+total, 0, tail); total += tail; } /* hash 32-byte blocks */ s = _mm_set_epi32(0xdeadbeef, 0xdeadbeef, 0xdeadbeef, 0xdeadbeef); c=d=e=f=g=h=i=j=k=l=s; for (counter=2; counter<=total/16; counter+=2) { CHURN1(cache,counter,-2,s,b,h,l); CHURN1(cache,counter,-1,s,a,i,k); l=j; k=i; j=h; i=g; h=f; g=e; f=d; e=c; d=b; c=a; } /* possibly handle trailing 16-byte block, then use up accumulators */ if (counter != total/16) { CHURN1(cache,counter,-2,s,b,h,l); TAIL1(s,k); TAIL1(s,j); TAIL1(s,i); TAIL1(s,h); TAIL1(s,g); TAIL1(s,f); TAIL1(s,e); TAIL1(s,d); TAIL1(s,c); TAIL1(s,b); } else { TAIL1(s,l); TAIL1(s,k); TAIL1(s,j); TAIL1(s,i); TAIL1(s,h); TAIL1(s,g); TAIL1(s,f); TAIL1(s,e); TAIL1(s,d); TAIL1(s,c); } /* final mixing */ FINAL1(s); val.h = s; return val; } /* * Compute a hash for the total message * Computing the hash and using the key are done only privately * This: init() update() final(key1) final(key2) final(key3) * is the same as: init() update() final(key3) */ z128 final(zorba *z, const void *key, size_t klen) { u8 total; unsigned short lengths; unsigned short int field; z128 val; /* use the key and the lengths */ total = z->messagelen + klen; if (total <= 15) { /* one state, just finalization */ val.h = _mm_set_epi32(0xdeadbeef, 0xdeadbeef, 0xdeadbeef, 0xdeadbeef); memcpy(((char *)val.x), (char *)z->data, z->datalen); memcpy(((char *)val.x)+z->datalen, key, klen); ((char *)val.x)[15] = 1 + z->datalen + (klen << 4); FINAL1(val.h); return val; } else if (total <= LARGE) { return midhash(z->data, z->datalen, key, klen); } else { /* four states, churn in parallel, finalization mixes those 4 states */ __m128i s0,a0,b0,c0,d0,e0,f0,g0,h0,i0,j0,k0,l0; __m128i s1,a1,b1,c1,d1,e1,f1,g1,h1,i1,j1,k1,l1; __m128i s2,a2,b2,c2,d2,e2,f2,g2,h2,i2,j2,k2,l2; __m128i s3,a3,b3,c3,d3,e3,f3,g3,h3,i3,j3,k3,l3; __m128i *cache = (__m128i *)z->data; int counter; if (klen > 16) { printf("key length must be 16 bytes or less\n"); exit(1); } total = z->datalen + klen; /* load the registers */ TO_REG(s,z->s,0); TO_REG(l,z->accum,0); TO_REG(k,z->accum,4); TO_REG(j,z->accum,8); TO_REG(i,z->accum,12); TO_REG(h,z->accum,16); TO_REG(g,z->accum,20); TO_REG(f,z->accum,24); TO_REG(e,z->accum,28); TO_REG(d,z->accum,32); TO_REG(c,z->accum,36); /* add the key and length; pad to a multiple of 64 bytes */ memcpy(((char *)z->data)+z->datalen, key, klen); ((char *)z->data)[total] = klen; ((char *)z->data)[total+1] = (z->datalen&63)+1; total += 2; if (total&63) { int tail = 64 - (total&63); memset(((char *)z->data)+total, 0, tail); total += tail; } /* consume all remaining 128-byte blocks */ for (counter=8; counter<=total/16; counter += 8) { CHURN4(cache,counter,-8,s,b,h,l); CHURN4(cache,counter,-4,s,a,i,k); REG_REG(l,j); REG_REG(k,i); REG_REG(j,h); REG_REG(i,g); REG_REG(h,f); REG_REG(g,e); REG_REG(f,d); REG_REG(e,c); REG_REG(d,b); REG_REG(c,a); } /* possibly another 64-byte chunk, then consume accumulators */ if (counter != total/16) { CHURN4(cache,counter,-8,s,b,h,l); TAIL4(s,k); TAIL4(s,j); TAIL4(s,i); TAIL4(s,h); TAIL4(s,g); TAIL4(s,f); TAIL4(s,e); TAIL4(s,d); TAIL4(s,c); TAIL4(s,b); } else { TAIL4(s,l); TAIL4(s,k); TAIL4(s,j); TAIL4(s,i); TAIL4(s,h); TAIL4(s,g); TAIL4(s,f); TAIL4(s,e); TAIL4(s,d); TAIL4(s,c); } /* mix s0..s3 to produce the final result */ FINAL4(s); val.h = s0; return val; } } /* hash a message all at once, with a key */ z128 keyhash( const void *message, size_t mlen, const void *key, size_t klen) { size_t len = mlen + klen; if (len <= 15) { /* really short message */ z128 val; val.h = _mm_set_epi32(0xdeadbeef, 0xdeadbeef, 0xdeadbeef, 0xdeadbeef); memcpy(((char *)val.x), message, mlen); memcpy(((char *)val.x)+mlen, key, klen); ((char *)val.x)[15] = 1 + mlen + (klen << 4); FINAL1(val.h); return val; } else if (mlen <= LARGE) { __m128i buf[BLOCK]; buf[mlen/16] = _mm_setzero_si128(); memcpy(buf, message, mlen); return midhash( buf, mlen, key, klen); } else { /* long message: do it the normal way */ zorba z; init(&z); update(&z, message, mlen); return final(&z, key, klen); } } /* hash a message all at once, without a key */ z128 hash( const void *message, size_t mlen) { if (mlen <= 15) { /* really short message */ z128 val; val.h = _mm_set_epi32(0xdeadbeef, 0xdeadbeef, 0xdeadbeef, 0xdeadbeef); memcpy(((char *)val.x), message, mlen); ((char *)val.x)[15] = 1 + mlen; FINAL1(val.h); return val; } else if (mlen <= LARGE) { char key[1]; __m128i buf[BLOCK]; buf[mlen/16] = _mm_setzero_si128(); memcpy(buf, message, mlen); return midhash( buf, mlen, key, 0); } else { /* long message: do it the normal way */ zorba z; char key[1]; init(&z); update(&z, message, mlen); return final(&z, key, 0); } } /* check that every input bit changes every output bit half the time */ #define HASHSTATE 1 #define MAXPAIR 40 #define MAXLEN 1000 void driver2() { u1 qa[MAXLEN+1], qb[MAXLEN+2], *a = &qa[0], *b = &qb[1], m=0; u8 c[HASHSTATE], d[HASHSTATE], i=0, j=0, k, l, z; u8 e[HASHSTATE],f[HASHSTATE],g[HASHSTATE],h[HASHSTATE]; u8 x[HASHSTATE],y[HASHSTATE]; u4 hlen; printf("No more than %d trials should ever be needed \n",MAXPAIR); for (hlen=0; hlen < MAXLEN; ++hlen) { z=0; for (i=0; i>(8-j)); c[0] = keyhash64(a, hlen, &m, 1); b[i] ^= ((k+1)<>(8-j)); d[0] = keyhash64(b, hlen, &m, 1); /* check every bit is 1, 0, set, and not set at least once */ for (l=0; lz) z=k; if (k==2*MAXPAIR) { printf("Some bit didn't change: "); printf("%.16llx %.16llx %.16llx %.16llx %.16llx %.16llx ", e[0],f[0],g[0],h[0],x[0],y[0]); printf("i %d j %d m %d len %d\n", (u4)i, (u4)j, (u4)m, (u4)hlen); } if (z==2*MAXPAIR) goto done; } } } done: if (z < 2*MAXPAIR) { printf("Mix success %2d bytes %2d initvals ",i,m); printf("required %d trials\n", z/2); } } printf("\n"); } #define HASH_LITTLE_ENDIAN 1 #define hashsize(n) ((u4)1<<(n)) #define hashmask(n) (hashsize(n)-1) #define rot(x,k) (((x)<<(k)) ^ ((x)>>(32-(k)))) #define mix(a,b,c) \ { \ a -= c; a ^= rot(c, 4); c += b; \ b -= a; b ^= rot(a, 6); a += c; \ c -= b; c ^= rot(b, 8); b += a; \ a -= c; a ^= rot(c,16); c += b; \ b -= a; b ^= rot(a,19); a += c; \ c -= b; c ^= rot(b, 4); b += a; \ } #define final(a,b,c) \ { \ c ^= b; c -= rot(b,14); \ a ^= c; a -= rot(c,11); \ b ^= a; b -= rot(a,25); \ c ^= b; c -= rot(b,16); \ a ^= c; a -= rot(c,4); \ b ^= a; b -= rot(a,14); \ c ^= b; c -= rot(b,24); \ } u4 hashlittle( const void *key, size_t length, u4 initval) { u4 a,b,c; /* internal state */ union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */ /* Set up the internal state */ a = b = c = 0xdeadbeef + ((u4)length) + initval; u.ptr = key; if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) { const u4 *k = key; /* read 32-bit chunks */ const u1 *k8; /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ while (length > 12) { a += k[0]; b += k[1]; c += k[2]; mix(a,b,c); length -= 12; k += 3; } /*----------------------------- handle the last (probably partial) block */ switch(length) { case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break; case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break; case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break; case 8 : b+=k[1]; a+=k[0]; break; case 7 : b+=k[1]&0xffffff; a+=k[0]; break; case 6 : b+=k[1]&0xffff; a+=k[0]; break; case 5 : b+=k[1]&0xff; a+=k[0]; break; case 4 : a+=k[0]; break; case 3 : a+=k[0]&0xffffff; break; case 2 : a+=k[0]&0xffff; break; case 1 : a+=k[0]&0xff; break; case 0 : return c; /* zero length strings require no mixing */ } } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { const u2 *k = key; /* read 16-bit chunks */ const u1 *k8; /*--------------- all but last block: aligned reads and different mixing */ while (length > 12) { a += k[0] + (((u4)k[1])<<16); b += k[2] + (((u4)k[3])<<16); c += k[4] + (((u4)k[5])<<16); mix(a,b,c); length -= 12; k += 6; } /*----------------------------- handle the last (probably partial) block */ k8 = (const u1 *)k; switch(length) { case 12: c+=k[4]+(((u4)k[5])<<16); b+=k[2]+(((u4)k[3])<<16); a+=k[0]+(((u4)k[1])<<16); break; case 11: c+=((u4)k8[10])<<16; /* fall through */ case 10: c+=k[4]; b+=k[2]+(((u4)k[3])<<16); a+=k[0]+(((u4)k[1])<<16); break; case 9 : c+=k8[8]; /* fall through */ case 8 : b+=k[2]+(((u4)k[3])<<16); a+=k[0]+(((u4)k[1])<<16); break; case 7 : b+=((u4)k8[6])<<16; /* fall through */ case 6 : b+=k[2]; a+=k[0]+(((u4)k[1])<<16); break; case 5 : b+=k8[4]; /* fall through */ case 4 : a+=k[0]+(((u4)k[1])<<16); break; case 3 : a+=((u4)k8[2])<<16; /* fall through */ case 2 : a+=k[0]; break; case 1 : a+=k8[0]; break; case 0 : return c; /* zero length requires no mixing */ } } else { /* need to read the key one byte at a time */ const u1 *k = key; /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += k[0]; a += ((u4)k[1])<<8; a += ((u4)k[2])<<16; a += ((u4)k[3])<<24; b += k[4]; b += ((u4)k[5])<<8; b += ((u4)k[6])<<16; b += ((u4)k[7])<<24; c += k[8]; c += ((u4)k[9])<<8; c += ((u4)k[10])<<16; c += ((u4)k[11])<<24; mix(a,b,c); length -= 12; k += 12; } /*-------------------------------- last block: affect all 32 bits of (c) */ switch(length) /* all the case statements fall through */ { case 12: c+=((u4)k[11])<<24; case 11: c+=((u4)k[10])<<16; case 10: c+=((u4)k[9])<<8; case 9 : c+=k[8]; case 8 : b+=((u4)k[7])<<24; case 7 : b+=((u4)k[6])<<16; case 6 : b+=((u4)k[5])<<8; case 5 : b+=k[4]; case 4 : a+=((u4)k[3])<<24; case 3 : a+=((u4)k[2])<<16; case 2 : a+=((u4)k[1])<<8; case 1 : a+=k[0]; break; case 0 : return c; } } final(a,b,c); return c; } #define MSIZE (1<<12) int main() { int i; u8 a,z; u8 val; u4 val4; __m128i message[MSIZE+1]; zorba zzz; memset(message, 42, sizeof(message)); a = GetTickCount(); for (i=0; i<(1<<20); ++i) { val4 += hashlittle(message, 200, 0); } #ifdef NEVER init(&zzz); for (i=0; i<(1<<18); ++i) { update(&zzz, &((char *)message)[0], 16*MSIZE); } val = final(&zzz, message, 0); for (i=0; i