Hash.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  *
00010  * The MD5 and Murmur hashing functions were integrated from public sources.
00011  * Their respective copyrights follow.
00012  *
00013  * MD5
00014  *
00015  * This code implements the MD5 message-digest algorithm.
00016  * The algorithm is due to Ron Rivest.  This code was
00017  * written by Colin Plumb in 1993, no copyright is claimed.
00018  * This code is in the public domain; do with it what you wish.
00019  *
00020  * Equivalent code is available from RSA Data Security, Inc.
00021  * This code has been tested against that, and is equivalent,
00022  * except that you don't need to include two pages of legalese
00023  * with every copy.
00024  *
00025  * To compute the message digest of a chunk of bytes, declare an
00026  * MD5Context structure, pass it to MD5Init, call MD5Update as
00027  * needed on buffers full of bytes, and then call MD5Final, which
00028  * will fill a supplied 16-byte array with the digest.
00029  *
00030  * MurmurHash2
00031  *
00032  * (C) Austin Appleby, released under the MIT License
00033  * 
00034  *  Note - This code makes a few assumptions about how your machine behaves -
00035  * 
00036  *  1. We can read a 4-byte value from any address without crashing
00037  *  2. It will not produce the same results on little-endian and big-endian
00038  *     machines.
00039  */
00040 
#include <shogun/lib/common.h>
#include <shogun/lib/Hash.h>
#include <ctype.h>
#include <string.h>
00044 
00045 using namespace shogun;
00046 
00047 uint32_t CHash::crc32(uint8_t *data, int32_t len)
00048 {
00049     uint32_t result;
00050     int32_t i,j;
00051     uint8_t octet;
00052 
00053     result = 0-1;
00054     for (i=0; i<len; i++)
00055     {
00056         octet = *(data++);
00057         for (j=0; j<8; j++)
00058         {
00059             if ((octet >> 7) ^ (result >> 31))
00060             {
00061                 result = (result << 1) ^ 0x04c11db7;
00062             }
00063             else
00064             {
00065                 result = (result << 1);
00066             }
00067             octet <<= 1;
00068         }
00069     }
00070 
00071     return ~result; 
00072 }
00073 
00074 void CHash::MD5(unsigned char *x, unsigned l, unsigned char *buf)
00075 {
00076     struct MD5Context ctx;
00077 
00078     MD5Init(&ctx);
00079     MD5Update(&ctx, x, l);
00080     MD5Final(buf, &ctx);
00081 }
00082 
/*
 * byteReverse(buf, longs) rewrites `longs` consecutive 32-bit words in buf.
 *
 * Without HIGHFIRST the call is compiled out entirely. With HIGHFIRST a real
 * function is declared; its C++ definition is only provided when ASM_MD5 is
 * not defined.
 */
#ifndef HIGHFIRST
#define byteReverse(buf, len)   /* Nothing */
#else
/* The count is a plain `unsigned`: `unsigned uint32_t` is not a valid type. */
void byteReverse(unsigned char *buf, unsigned longs);

#ifndef ASM_MD5
/*
 * Each word is assembled from its four bytes taken in little-endian order
 * (buf[0] is the least significant byte) and stored back as a native
 * uint32_t.
 *
 * Note: this code is harmless on little-endian machines.
 *
 * longs must be at least 1: the loop body always runs once before the
 * counter is tested.
 */
void byteReverse(unsigned char *buf, unsigned longs)
{
    uint32_t t;
    do {
        t = (uint32_t) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
            ((unsigned) buf[1] << 8 | buf[0]);
        *(uint32_t *) buf = t;
        buf += 4;
    } while (--longs);
}
#endif
#endif
00104 
00105 void CHash::MD5Init(struct MD5Context *ctx)
00106 {
00107     ctx->buf[0] = 0x67452301;
00108     ctx->buf[1] = 0xefcdab89;
00109     ctx->buf[2] = 0x98badcfe;
00110     ctx->buf[3] = 0x10325476;
00111 
00112     ctx->bits[0] = 0;
00113     ctx->bits[1] = 0;
00114 }
00115 
00116 void CHash::MD5Update(struct MD5Context *ctx, unsigned char const *buf,
00117                unsigned len)
00118 {
00119     uint32_t t;
00120 
00121     /* Update bitcount */
00122 
00123     t = ctx->bits[0];
00124     if ((ctx->bits[0] = t + ((uint32_t) len << 3)) < t)
00125         ctx->bits[1]++;         /* Carry from low to high */
00126     ctx->bits[1] += len >> 29;
00127 
00128     t = (t >> 3) & 0x3f;        /* Bytes already in shsInfo->data */
00129 
00130     /* Handle any leading odd-sized chunks */
00131 
00132     if (t) {
00133         unsigned char *p = (unsigned char *) ctx->in + t;
00134 
00135         t = 64 - t;
00136         if (len < t) {
00137             memcpy(p, buf, len);
00138             return;
00139         }
00140         memcpy(p, buf, t);
00141         byteReverse(ctx->in, 16);
00142         MD5Transform(ctx->buf, (uint32_t *) ctx->in);
00143         buf += t;
00144         len -= t;
00145     }
00146     /* Process data in 64-byte chunks */
00147 
00148     while (len >= 64) {
00149         memcpy(ctx->in, buf, 64);
00150         byteReverse(ctx->in, 16);
00151         MD5Transform(ctx->buf, (uint32_t *) ctx->in);
00152         buf += 64;
00153         len -= 64;
00154     }
00155 
00156     /* Handle any remaining bytes of data. */
00157 
00158     memcpy(ctx->in, buf, len);
00159 }
00160 
00161 void CHash::MD5Final(unsigned char digest[16], struct MD5Context *ctx)
00162 {
00163     unsigned count;
00164     unsigned char *p;
00165 
00166     /* Compute number of bytes mod 64 */
00167     count = (ctx->bits[0] >> 3) & 0x3F;
00168 
00169     /* Set the first char of padding to 0x80.  This is safe since there is
00170        always at least one byte free */
00171     p = ctx->in + count;
00172     *p++ = 0x80;
00173 
00174     /* Bytes of padding needed to make 64 bytes */
00175     count = 64 - 1 - count;
00176 
00177     /* Pad out to 56 mod 64 */
00178     if (count < 8) {
00179         /* Two lots of padding:  Pad the first block to 64 bytes */
00180         memset(p, 0, count);
00181         byteReverse(ctx->in, 16);
00182         MD5Transform(ctx->buf, (uint32_t *) ctx->in);
00183 
00184         /* Now fill the next block with 56 bytes */
00185         memset(ctx->in, 0, 56);
00186     } else {
00187         /* Pad block to 56 bytes */
00188         memset(p, 0, count - 8);
00189     }
00190     byteReverse(ctx->in, 14);
00191 
00192     /* Append length in bits and transform */
00193     ((uint32_t *) ctx->in)[14] = ctx->bits[0];
00194     ((uint32_t *) ctx->in)[15] = ctx->bits[1];
00195 
00196     MD5Transform(ctx->buf, (uint32_t *) ctx->in);
00197     byteReverse((unsigned char *) ctx->buf, 4);
00198     memcpy(digest, ctx->buf, 16);
00199     memset(ctx, 0, sizeof(ctx));        /* In case it's sensitive */
00200 }
00201 
00202 #ifndef ASM_MD5
00203 
00204 /* The four core functions - F1 is optimized somewhat */
00205 
00206 /* #define F1(x, y, z) (x & y | ~x & z) */
00207 #define F1(x, y, z) (z ^ (x & (y ^ z)))
00208 #define F2(x, y, z) F1(z, x, y)
00209 #define F3(x, y, z) (x ^ y ^ z)
00210 #define F4(x, y, z) (y ^ (x | ~z))
00211 
00212 /* This is the central step in the MD5 algorithm. */
00213 #ifdef __PUREC__
00214 #define MD5STEP(f, w, x, y, z, data, s) \
00215     ( w += f /*(x, y, z)*/ + data,  w = w<<s | w>>(32-s),  w += x )
00216 #else
00217 #define MD5STEP(f, w, x, y, z, data, s) \
00218     ( w += f(x, y, z) + data,  w = w<<s | w>>(32-s),  w += x )
00219 #endif
00220 
00221 void CHash::MD5Transform(uint32_t buf[4], uint32_t const in[16])
00222 {
00223     register uint32_t a, b, c, d;
00224 
00225     a = buf[0];
00226     b = buf[1];
00227     c = buf[2];
00228     d = buf[3];
00229 
00230 #ifdef __PUREC__                /* PureC Weirdness... (GG) */
00231     MD5STEP(F1(b, c, d), a, b, c, d, in[0] + 0xd76aa478L, 7);
00232     MD5STEP(F1(a, b, c), d, a, b, c, in[1] + 0xe8c7b756L, 12);
00233     MD5STEP(F1(d, a, b), c, d, a, b, in[2] + 0x242070dbL, 17);
00234     MD5STEP(F1(c, d, a), b, c, d, a, in[3] + 0xc1bdceeeL, 22);
00235     MD5STEP(F1(b, c, d), a, b, c, d, in[4] + 0xf57c0fafL, 7);
00236     MD5STEP(F1(a, b, c), d, a, b, c, in[5] + 0x4787c62aL, 12);
00237     MD5STEP(F1(d, a, b), c, d, a, b, in[6] + 0xa8304613L, 17);
00238     MD5STEP(F1(c, d, a), b, c, d, a, in[7] + 0xfd469501L, 22);
00239     MD5STEP(F1(b, c, d), a, b, c, d, in[8] + 0x698098d8L, 7);
00240     MD5STEP(F1(a, b, c), d, a, b, c, in[9] + 0x8b44f7afL, 12);
00241     MD5STEP(F1(d, a, b), c, d, a, b, in[10] + 0xffff5bb1L, 17);
00242     MD5STEP(F1(c, d, a), b, c, d, a, in[11] + 0x895cd7beL, 22);
00243     MD5STEP(F1(b, c, d), a, b, c, d, in[12] + 0x6b901122L, 7);
00244     MD5STEP(F1(a, b, c), d, a, b, c, in[13] + 0xfd987193L, 12);
00245     MD5STEP(F1(d, a, b), c, d, a, b, in[14] + 0xa679438eL, 17);
00246     MD5STEP(F1(c, d, a), b, c, d, a, in[15] + 0x49b40821L, 22);
00247 
00248     MD5STEP(F2(b, c, d), a, b, c, d, in[1] + 0xf61e2562L, 5);
00249     MD5STEP(F2(a, b, c), d, a, b, c, in[6] + 0xc040b340L, 9);
00250     MD5STEP(F2(d, a, b), c, d, a, b, in[11] + 0x265e5a51L, 14);
00251     MD5STEP(F2(c, d, a), b, c, d, a, in[0] + 0xe9b6c7aaL, 20);
00252     MD5STEP(F2(b, c, d), a, b, c, d, in[5] + 0xd62f105dL, 5);
00253     MD5STEP(F2(a, b, c), d, a, b, c, in[10] + 0x02441453L, 9);
00254     MD5STEP(F2(d, a, b), c, d, a, b, in[15] + 0xd8a1e681L, 14);
00255     MD5STEP(F2(c, d, a), b, c, d, a, in[4] + 0xe7d3fbc8L, 20);
00256     MD5STEP(F2(b, c, d), a, b, c, d, in[9] + 0x21e1cde6L, 5);
00257     MD5STEP(F2(a, b, c), d, a, b, c, in[14] + 0xc33707d6L, 9);
00258     MD5STEP(F2(d, a, b), c, d, a, b, in[3] + 0xf4d50d87L, 14);
00259     MD5STEP(F2(c, d, a), b, c, d, a, in[8] + 0x455a14edL, 20);
00260     MD5STEP(F2(b, c, d), a, b, c, d, in[13] + 0xa9e3e905L, 5);
00261     MD5STEP(F2(a, b, c), d, a, b, c, in[2] + 0xfcefa3f8L, 9);
00262     MD5STEP(F2(d, a, b), c, d, a, b, in[7] + 0x676f02d9L, 14);
00263     MD5STEP(F2(c, d, a), b, c, d, a, in[12] + 0x8d2a4c8aL, 20);
00264 
00265     MD5STEP(F3(b, c, d), a, b, c, d, in[5] + 0xfffa3942L, 4);
00266     MD5STEP(F3(a, b, c), d, a, b, c, in[8] + 0x8771f681L, 11);
00267     MD5STEP(F3(d, a, b), c, d, a, b, in[11] + 0x6d9d6122L, 16);
00268     MD5STEP(F3(c, d, a), b, c, d, a, in[14] + 0xfde5380cL, 23);
00269     MD5STEP(F3(b, c, d), a, b, c, d, in[1] + 0xa4beea44L, 4);
00270     MD5STEP(F3(a, b, c), d, a, b, c, in[4] + 0x4bdecfa9L, 11);
00271     MD5STEP(F3(d, a, b), c, d, a, b, in[7] + 0xf6bb4b60L, 16);
00272     MD5STEP(F3(c, d, a), b, c, d, a, in[10] + 0xbebfbc70L, 23);
00273     MD5STEP(F3(b, c, d), a, b, c, d, in[13] + 0x289b7ec6L, 4);
00274     MD5STEP(F3(a, b, c), d, a, b, c, in[0] + 0xeaa127faL, 11);
00275     MD5STEP(F3(d, a, b), c, d, a, b, in[3] + 0xd4ef3085L, 16);
00276     MD5STEP(F3(c, d, a), b, c, d, a, in[6] + 0x04881d05L, 23);
00277     MD5STEP(F3(b, c, d), a, b, c, d, in[9] + 0xd9d4d039L, 4);
00278     MD5STEP(F3(a, b, c), d, a, b, c, in[12] + 0xe6db99e5L, 11);
00279     MD5STEP(F3(d, a, b), c, d, a, b, in[15] + 0x1fa27cf8L, 16);
00280     MD5STEP(F3(c, d, a), b, c, d, a, in[2] + 0xc4ac5665L, 23);
00281 
00282     MD5STEP(F4(b, c, d), a, b, c, d, in[0] + 0xf4292244L, 6);
00283     MD5STEP(F4(a, b, c), d, a, b, c, in[7] + 0x432aff97L, 10);
00284     MD5STEP(F4(d, a, b), c, d, a, b, in[14] + 0xab9423a7L, 15);
00285     MD5STEP(F4(c, d, a), b, c, d, a, in[5] + 0xfc93a039L, 21);
00286     MD5STEP(F4(b, c, d), a, b, c, d, in[12] + 0x655b59c3L, 6);
00287     MD5STEP(F4(a, b, c), d, a, b, c, in[3] + 0x8f0ccc92L, 10);
00288     MD5STEP(F4(d, a, b), c, d, a, b, in[10] + 0xffeff47dL, 15);
00289     MD5STEP(F4(c, d, a), b, c, d, a, in[1] + 0x85845dd1L, 21);
00290     MD5STEP(F4(b, c, d), a, b, c, d, in[8] + 0x6fa87e4fL, 6);
00291     MD5STEP(F4(a, b, c), d, a, b, c, in[15] + 0xfe2ce6e0L, 10);
00292     MD5STEP(F4(d, a, b), c, d, a, b, in[6] + 0xa3014314L, 15);
00293     MD5STEP(F4(c, d, a), b, c, d, a, in[13] + 0x4e0811a1L, 21);
00294     MD5STEP(F4(b, c, d), a, b, c, d, in[4] + 0xf7537e82L, 6);
00295     MD5STEP(F4(a, b, c), d, a, b, c, in[11] + 0xbd3af235L, 10);
00296     MD5STEP(F4(d, a, b), c, d, a, b, in[2] + 0x2ad7d2bbL, 15);
00297     MD5STEP(F4(c, d, a), b, c, d, a, in[9] + 0xeb86d391L, 21);
00298 #else
00299     MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
00300     MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
00301     MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
00302     MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
00303     MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
00304     MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
00305     MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
00306     MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
00307     MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
00308     MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
00309     MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
00310     MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
00311     MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
00312     MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
00313     MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
00314     MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
00315 
00316     MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
00317     MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
00318     MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
00319     MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
00320     MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
00321     MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
00322     MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
00323     MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
00324     MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
00325     MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
00326     MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
00327     MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
00328     MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
00329     MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
00330     MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
00331     MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
00332 
00333     MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
00334     MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
00335     MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
00336     MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
00337     MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
00338     MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
00339     MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
00340     MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
00341     MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
00342     MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
00343     MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
00344     MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
00345     MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
00346     MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
00347     MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
00348     MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
00349 
00350     MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
00351     MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
00352     MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
00353     MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
00354     MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
00355     MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
00356     MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
00357     MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
00358     MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
00359     MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
00360     MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
00361     MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
00362     MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
00363     MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
00364     MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
00365     MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
00366 #endif
00367 
00368     buf[0] += a;
00369     buf[1] += b;
00370     buf[2] += c;
00371     buf[3] += d;
00372 }
00373 #endif
00374 
00375 uint32_t CHash::MurmurHash2(uint8_t* data, int32_t len, uint32_t seed)
00376 {
00377     // 'm' and 'r' are mixing constants generated offline.
00378     // They're not really 'magic', they just happen to work well.
00379 
00380     const uint32_t m = 0x5bd1e995;
00381     const int32_t r = 24;
00382 
00383     // Initialize the hash to a 'random' value
00384 
00385     uint32_t h = seed ^ len;
00386 
00387     // Mix 4 bytes at a time into the hash
00388 
00389     while(len >= 4)
00390     {
00391         uint32_t k = *(uint32_t *)data;
00392 
00393         k *= m; 
00394         k ^= k >> r; 
00395         k *= m; 
00396 
00397         h *= m; 
00398         h ^= k;
00399 
00400         data += 4;
00401         len -= 4;
00402     }
00403 
00404     // Handle the last few bytes of the input array
00405 
00406     switch(len)
00407     {
00408         case 3: h ^= data[2] << 16;
00409         case 2: h ^= data[1] << 8;
00410         case 1: h ^= data[0];
00411                 h *= m;
00412     };
00413 
00414     // Do a few final mixes of the hash to ensure the last few
00415     // bytes are well-incorporated.
00416 
00417     h ^= h >> 13;
00418     h *= m;
00419     h ^= h >> 15;
00420 
00421     return h;
00422 } 
00423 
00424 uint32_t CHash::IncrementalMurmurHash2(uint8_t data, uint32_t h)
00425 {
00426     // 'm' and 'r' are mixing constants generated offline.
00427     // They're not really 'magic', they just happen to work well.
00428 
00429     const uint32_t m = 0x5bd1e995;
00430 
00431     h ^= data;
00432     h *= m;
00433 
00434     // Do a few final mixes of the hash to ensure the last few
00435     // bytes are well-incorporated.
00436 
00437     h ^= h >> 13;
00438     h *= m;
00439     h ^= h >> 15;
00440 
00441     return h;
00442 } 
00443 
00444 uint32_t CHash::MurmurHashString(substring s, uint32_t h)
00445 {
00446     uint32_t ret = 0;
00447 
00448     // Trim leading whitespace
00449     for(; *(s.start) <= 0x20 && s.start < s.end; s.start++);
00450 
00451     // Trim trailing white space
00452     for(; *(s.end-1) <= 0x20 && s.end > s.start; s.end--);
00453 
00454     char *p = s.start;
00455     while (p != s.end)
00456         if (isdigit(*p))
00457             ret = 10*ret + *(p++) - '0';
00458         else
00459             return MurmurHash2((uint8_t *)s.start, s.end - s.start, h);
00460 
00461     return ret + h;
00462 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation