SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Hash.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2009 Soeren Sonnenburg
8  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  *
10  * The MD5 hashing function was integrated from public sources.
11  * Its copyright follows.
12  *
13  * MD5
14  *
15  * This code implements the MD5 message-digest algorithm.
16  * The algorithm is due to Ron Rivest. This code was
17  * written by Colin Plumb in 1993, no copyright is claimed.
18  * This code is in the public domain; do with it what you wish.
19  *
20  * Equivalent code is available from RSA Data Security, Inc.
21  * This code has been tested against that, and is equivalent,
22  * except that you don't need to include two pages of legalese
23  * with every copy.
24  *
25  * To compute the message digest of a chunk of bytes, declare an
26  * MD5Context structure, pass it to MD5Init, call MD5Update as
27  * needed on buffers full of bytes, and then call MD5Final, which
28  * will fill a supplied 16-byte array with the digest.
29  */
30 
31 #include <shogun/lib/Hash.h>
32 #include <shogun/io/SGIO.h>
33 #include <shogun/lib/external/PMurHash.h>
34 #include <ctype.h>
35 
36 using namespace shogun;
37 
38 uint32_t CHash::crc32(uint8_t *data, int32_t len)
39 {
40  uint32_t result;
41  int32_t i,j;
42  uint8_t octet;
43 
44  result = 0-1;
45  for (i=0; i<len; i++)
46  {
47  octet = *(data++);
48  for (j=0; j<8; j++)
49  {
50  if ((octet >> 7) ^ (result >> 31))
51  {
52  result = (result << 1) ^ 0x04c11db7;
53  }
54  else
55  {
56  result = (result << 1);
57  }
58  octet <<= 1;
59  }
60  }
61 
62  return ~result;
63 }
64 
65 void CHash::MD5(unsigned char *x, unsigned l, unsigned char *buf)
66 {
67  struct MD5Context ctx;
68 
69  MD5Init(&ctx);
70  MD5Update(&ctx, x, l);
71  MD5Final(buf, &ctx);
72 }
73 
#ifndef HIGHFIRST
/* HIGHFIRST not defined: byteReverse() compiles away to nothing. */
#define byteReverse(buf, len) /* Nothing */
#else
/*
 * NOTE(review): HIGHFIRST is presumably defined by the build for big-endian
 * targets - confirm against the build configuration.
 */
void byteReverse(unsigned char *buf, unsigned longs);

#ifndef ASM_MD5
/*
 * Re-encode `longs` consecutive 4-byte groups of `buf` in place: each group
 * is read as a little-endian 32-bit value and written back in the host's
 * native representation. `longs` must be at least 1 (do/while loop).
 *
 * Note: this code is harmless on little-endian machines.
 */
void byteReverse(unsigned char *buf, unsigned longs)
{
	uint32_t t;
	do {
		t = (uint32_t) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
			((unsigned) buf[1] << 8 | buf[0]);
		/* memcpy instead of *(uint32_t *) buf = t: no alignment or
		 * aliasing assumptions about buf */
		memcpy(buf, &t, sizeof(t));
		buf += 4;
	} while (--longs);
}
#endif
#endif
95 
96 void CHash::MD5Init(struct MD5Context *ctx)
97 {
98  ctx->buf[0] = 0x67452301;
99  ctx->buf[1] = 0xefcdab89;
100  ctx->buf[2] = 0x98badcfe;
101  ctx->buf[3] = 0x10325476;
102 
103  ctx->bits[0] = 0;
104  ctx->bits[1] = 0;
105 }
106 
107 void CHash::MD5Update(struct MD5Context *ctx, unsigned char const *buf,
108  unsigned len)
109 {
110  uint32_t t;
111 
112  /* Update bitcount */
113 
114  t = ctx->bits[0];
115  if ((ctx->bits[0] = t + ((uint32_t) len << 3)) < t)
116  ctx->bits[1]++; /* Carry from low to high */
117  ctx->bits[1] += len >> 29;
118 
119  t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */
120 
121  /* Handle any leading odd-sized chunks */
122 
123  if (t) {
124  unsigned char *p = (unsigned char *) ctx->in + t;
125 
126  t = 64 - t;
127  if (len < t) {
128  memcpy(p, buf, len);
129  return;
130  }
131  memcpy(p, buf, t);
132  byteReverse(ctx->in, 16);
133  MD5Transform(ctx->buf, (uint32_t *) ctx->in);
134  buf += t;
135  len -= t;
136  }
137  /* Process data in 64-byte chunks */
138 
139  while (len >= 64) {
140  memcpy(ctx->in, buf, 64);
141  byteReverse(ctx->in, 16);
142  MD5Transform(ctx->buf, (uint32_t *) ctx->in);
143  buf += 64;
144  len -= 64;
145  }
146 
147  /* Handle any remaining bytes of data. */
148 
149  memcpy(ctx->in, buf, len);
150 }
151 
152 void CHash::MD5Final(unsigned char digest[16], struct MD5Context *ctx)
153 {
154  unsigned count;
155  unsigned char *p;
156 
157  /* Compute number of bytes mod 64 */
158  count = (ctx->bits[0] >> 3) & 0x3F;
159 
160  /* Set the first char of padding to 0x80. This is safe since there is
161  always at least one byte free */
162  p = ctx->in + count;
163  *p++ = 0x80;
164 
165  /* Bytes of padding needed to make 64 bytes */
166  count = 64 - 1 - count;
167 
168  /* Pad out to 56 mod 64 */
169  if (count < 8) {
170  /* Two lots of padding: Pad the first block to 64 bytes */
171  memset(p, 0, count);
172  byteReverse(ctx->in, 16);
173  MD5Transform(ctx->buf, (uint32_t *) ctx->in);
174 
175  /* Now fill the next block with 56 bytes */
176  memset(ctx->in, 0, 56);
177  } else {
178  /* Pad block to 56 bytes */
179  memset(p, 0, count - 8);
180  }
181  byteReverse(ctx->in, 14);
182 
183  /* Append length in bits and transform */
184  ctx->uin[14] = ctx->bits[0];
185  ctx->uin[15] = ctx->bits[1];
186 
187  MD5Transform(ctx->buf, (uint32_t *) ctx->in);
188  byteReverse((unsigned char *) ctx->buf, 4);
189  memcpy(digest, ctx->buf, 16);
190  memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
191 }
192 
193 #ifndef ASM_MD5
194 
/*
 * The four core functions - F1 is optimized somewhat.
 *
 * Every macro argument is parenthesised so that compound expressions
 * (e.g. `p | q`) can be passed without changing the result through
 * operator precedence.
 */

/* #define F1(x, y, z) (x & y | ~x & z) */
#define F1(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
#define F2(x, y, z) F1(z, x, y)
#define F3(x, y, z) ((x) ^ (y) ^ (z))
#define F4(x, y, z) ((y) ^ ((x) | ~(z)))

/*
 * This is the central step in the MD5 algorithm:
 * add f(x, y, z) and `data` to w, rotate w left by s bits, then add x.
 * `w` is modified in place and is evaluated several times, so it must be a
 * side-effect-free lvalue; `s` must be in 1..31 for a 32-bit `w`.
 *
 * In the __PUREC__ variant `f` is an already-applied expression such as
 * F1(b, c, d) rather than a macro name.
 */
#ifdef __PUREC__
#define MD5STEP(f, w, x, y, z, data, s) \
	( (w) += (f) /*(x, y, z)*/ + (data), (w) = (w)<<(s) | (w)>>(32-(s)), (w) += (x) )
#else
#define MD5STEP(f, w, x, y, z, data, s) \
	( (w) += f(x, y, z) + (data), (w) = (w)<<(s) | (w)>>(32-(s)), (w) += (x) )
#endif
211 
212 void CHash::MD5Transform(uint32_t buf[4], uint32_t const in[16])
213 {
214  register uint32_t a, b, c, d;
215 
216  a = buf[0];
217  b = buf[1];
218  c = buf[2];
219  d = buf[3];
220 
221 #ifdef __PUREC__ /* PureC Weirdness... (GG) */
222  MD5STEP(F1(b, c, d), a, b, c, d, in[0] + 0xd76aa478L, 7);
223  MD5STEP(F1(a, b, c), d, a, b, c, in[1] + 0xe8c7b756L, 12);
224  MD5STEP(F1(d, a, b), c, d, a, b, in[2] + 0x242070dbL, 17);
225  MD5STEP(F1(c, d, a), b, c, d, a, in[3] + 0xc1bdceeeL, 22);
226  MD5STEP(F1(b, c, d), a, b, c, d, in[4] + 0xf57c0fafL, 7);
227  MD5STEP(F1(a, b, c), d, a, b, c, in[5] + 0x4787c62aL, 12);
228  MD5STEP(F1(d, a, b), c, d, a, b, in[6] + 0xa8304613L, 17);
229  MD5STEP(F1(c, d, a), b, c, d, a, in[7] + 0xfd469501L, 22);
230  MD5STEP(F1(b, c, d), a, b, c, d, in[8] + 0x698098d8L, 7);
231  MD5STEP(F1(a, b, c), d, a, b, c, in[9] + 0x8b44f7afL, 12);
232  MD5STEP(F1(d, a, b), c, d, a, b, in[10] + 0xffff5bb1L, 17);
233  MD5STEP(F1(c, d, a), b, c, d, a, in[11] + 0x895cd7beL, 22);
234  MD5STEP(F1(b, c, d), a, b, c, d, in[12] + 0x6b901122L, 7);
235  MD5STEP(F1(a, b, c), d, a, b, c, in[13] + 0xfd987193L, 12);
236  MD5STEP(F1(d, a, b), c, d, a, b, in[14] + 0xa679438eL, 17);
237  MD5STEP(F1(c, d, a), b, c, d, a, in[15] + 0x49b40821L, 22);
238 
239  MD5STEP(F2(b, c, d), a, b, c, d, in[1] + 0xf61e2562L, 5);
240  MD5STEP(F2(a, b, c), d, a, b, c, in[6] + 0xc040b340L, 9);
241  MD5STEP(F2(d, a, b), c, d, a, b, in[11] + 0x265e5a51L, 14);
242  MD5STEP(F2(c, d, a), b, c, d, a, in[0] + 0xe9b6c7aaL, 20);
243  MD5STEP(F2(b, c, d), a, b, c, d, in[5] + 0xd62f105dL, 5);
244  MD5STEP(F2(a, b, c), d, a, b, c, in[10] + 0x02441453L, 9);
245  MD5STEP(F2(d, a, b), c, d, a, b, in[15] + 0xd8a1e681L, 14);
246  MD5STEP(F2(c, d, a), b, c, d, a, in[4] + 0xe7d3fbc8L, 20);
247  MD5STEP(F2(b, c, d), a, b, c, d, in[9] + 0x21e1cde6L, 5);
248  MD5STEP(F2(a, b, c), d, a, b, c, in[14] + 0xc33707d6L, 9);
249  MD5STEP(F2(d, a, b), c, d, a, b, in[3] + 0xf4d50d87L, 14);
250  MD5STEP(F2(c, d, a), b, c, d, a, in[8] + 0x455a14edL, 20);
251  MD5STEP(F2(b, c, d), a, b, c, d, in[13] + 0xa9e3e905L, 5);
252  MD5STEP(F2(a, b, c), d, a, b, c, in[2] + 0xfcefa3f8L, 9);
253  MD5STEP(F2(d, a, b), c, d, a, b, in[7] + 0x676f02d9L, 14);
254  MD5STEP(F2(c, d, a), b, c, d, a, in[12] + 0x8d2a4c8aL, 20);
255 
256  MD5STEP(F3(b, c, d), a, b, c, d, in[5] + 0xfffa3942L, 4);
257  MD5STEP(F3(a, b, c), d, a, b, c, in[8] + 0x8771f681L, 11);
258  MD5STEP(F3(d, a, b), c, d, a, b, in[11] + 0x6d9d6122L, 16);
259  MD5STEP(F3(c, d, a), b, c, d, a, in[14] + 0xfde5380cL, 23);
260  MD5STEP(F3(b, c, d), a, b, c, d, in[1] + 0xa4beea44L, 4);
261  MD5STEP(F3(a, b, c), d, a, b, c, in[4] + 0x4bdecfa9L, 11);
262  MD5STEP(F3(d, a, b), c, d, a, b, in[7] + 0xf6bb4b60L, 16);
263  MD5STEP(F3(c, d, a), b, c, d, a, in[10] + 0xbebfbc70L, 23);
264  MD5STEP(F3(b, c, d), a, b, c, d, in[13] + 0x289b7ec6L, 4);
265  MD5STEP(F3(a, b, c), d, a, b, c, in[0] + 0xeaa127faL, 11);
266  MD5STEP(F3(d, a, b), c, d, a, b, in[3] + 0xd4ef3085L, 16);
267  MD5STEP(F3(c, d, a), b, c, d, a, in[6] + 0x04881d05L, 23);
268  MD5STEP(F3(b, c, d), a, b, c, d, in[9] + 0xd9d4d039L, 4);
269  MD5STEP(F3(a, b, c), d, a, b, c, in[12] + 0xe6db99e5L, 11);
270  MD5STEP(F3(d, a, b), c, d, a, b, in[15] + 0x1fa27cf8L, 16);
271  MD5STEP(F3(c, d, a), b, c, d, a, in[2] + 0xc4ac5665L, 23);
272 
273  MD5STEP(F4(b, c, d), a, b, c, d, in[0] + 0xf4292244L, 6);
274  MD5STEP(F4(a, b, c), d, a, b, c, in[7] + 0x432aff97L, 10);
275  MD5STEP(F4(d, a, b), c, d, a, b, in[14] + 0xab9423a7L, 15);
276  MD5STEP(F4(c, d, a), b, c, d, a, in[5] + 0xfc93a039L, 21);
277  MD5STEP(F4(b, c, d), a, b, c, d, in[12] + 0x655b59c3L, 6);
278  MD5STEP(F4(a, b, c), d, a, b, c, in[3] + 0x8f0ccc92L, 10);
279  MD5STEP(F4(d, a, b), c, d, a, b, in[10] + 0xffeff47dL, 15);
280  MD5STEP(F4(c, d, a), b, c, d, a, in[1] + 0x85845dd1L, 21);
281  MD5STEP(F4(b, c, d), a, b, c, d, in[8] + 0x6fa87e4fL, 6);
282  MD5STEP(F4(a, b, c), d, a, b, c, in[15] + 0xfe2ce6e0L, 10);
283  MD5STEP(F4(d, a, b), c, d, a, b, in[6] + 0xa3014314L, 15);
284  MD5STEP(F4(c, d, a), b, c, d, a, in[13] + 0x4e0811a1L, 21);
285  MD5STEP(F4(b, c, d), a, b, c, d, in[4] + 0xf7537e82L, 6);
286  MD5STEP(F4(a, b, c), d, a, b, c, in[11] + 0xbd3af235L, 10);
287  MD5STEP(F4(d, a, b), c, d, a, b, in[2] + 0x2ad7d2bbL, 15);
288  MD5STEP(F4(c, d, a), b, c, d, a, in[9] + 0xeb86d391L, 21);
289 #else
290  MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
291  MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
292  MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
293  MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
294  MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
295  MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
296  MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
297  MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
298  MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
299  MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
300  MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
301  MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
302  MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
303  MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
304  MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
305  MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
306 
307  MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
308  MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
309  MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
310  MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
311  MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
312  MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
313  MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
314  MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
315  MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
316  MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
317  MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
318  MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
319  MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
320  MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
321  MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
322  MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
323 
324  MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
325  MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
326  MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
327  MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
328  MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
329  MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
330  MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
331  MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
332  MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
333  MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
334  MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
335  MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
336  MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
337  MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
338  MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
339  MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
340 
341  MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
342  MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
343  MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
344  MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
345  MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
346  MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
347  MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
348  MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
349  MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
350  MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
351  MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
352  MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
353  MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
354  MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
355  MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
356  MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
357 #endif
358 
359  buf[0] += a;
360  buf[1] += b;
361  buf[2] += c;
362  buf[3] += d;
363 }
364 #endif
365 
366 uint32_t CHash::MurmurHash3(uint8_t* data, int32_t len, uint32_t seed)
367 {
368  return PMurHash32(seed, data, len);
369 }
370 
371 void CHash::IncrementalMurmurHash3(uint32_t *ph1, uint32_t *pcarry, uint8_t* data, int32_t len)
372 {
373  PMurHash32_Process(ph1, pcarry, data, len);
374 }
375 
376 uint32_t CHash::FinalizeIncrementalMurmurHash3(uint32_t h, uint32_t carry, uint32_t total_length)
377 {
378  return PMurHash32_Result(h, carry, total_length);
379 }
380 
381 uint32_t CHash::MurmurHashString(substring s, uint32_t h)
382 {
383  uint32_t ret = 0;
384 
385  // Trim leading whitespace
386  for(; *(s.start) <= 0x20 && s.start < s.end; s.start++);
387 
388  // Trim trailing white space
389  for(; *(s.end-1) <= 0x20 && s.end > s.start; s.end--);
390 
391  char *p = s.start;
392  while (p != s.end)
393  if (isdigit(*p))
394  ret = 10*ret + *(p++) - '0';
395  else
396  return MurmurHash3((uint8_t *)s.start, s.end - s.start, h);
397 
398  return ret + h;
399 }

SHOGUN Machine Learning Toolbox - Documentation