1 #include "fch.h"
2 #include "cmph_structs.h"
3 #include "fch_structs.h"
4 #include "hash.h"
5 #include "bitbool.h"
6 #include "fch_buckets.h"
7 #include <math.h>
8 #include <stdlib.h>
9 #include <stdio.h>
10 #include <assert.h>
11 #include <string.h>
12 #include <errno.h>
13
14 #define INDEX 0 /* alignment index within a bucket */
15 //#define DEBUG
16 #include "debug.h"
17
18 static fch_buckets_t * mapping(cmph_config_t *mph);
19 static cmph_uint32 * ordering(fch_buckets_t * buckets);
20 static cmph_uint8 check_for_collisions_h2(fch_config_data_t *fch, fch_buckets_t * buckets, cmph_uint32 *sorted_indexes);
21 static void permut(cmph_uint32 * vector, cmph_uint32 n);
22 static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph_uint32 *sorted_indexes);
23
24 fch_config_data_t *fch_config_new(void)
25 {
26 fch_config_data_t *fch;
27 fch = (fch_config_data_t *)malloc(sizeof(fch_config_data_t));
28 assert(fch);
29 memset(fch, 0, sizeof(fch_config_data_t));
30 fch->hashfuncs[0] = CMPH_HASH_JENKINS;
31 fch->hashfuncs[1] = CMPH_HASH_JENKINS;
32 fch->m = fch->b = 0;
33 fch->c = fch->p1 = fch->p2 = 0.0;
34 fch->g = NULL;
35 fch->h1 = NULL;
36 fch->h2 = NULL;
37 return fch;
38 }
39
40 void fch_config_destroy(cmph_config_t *mph)
41 {
42 fch_config_data_t *data = (fch_config_data_t *)mph->data;
43 //DEBUGP("Destroying algorithm dependent data\n");
44 free(data);
45 }
46
47 void fch_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
48 {
49 fch_config_data_t *fch = (fch_config_data_t *)mph->data;
50 CMPH_HASH *hashptr = hashfuncs;
51 cmph_uint32 i = 0;
52 while(*hashptr != CMPH_HASH_COUNT)
53 {
54 if (i >= 2) break; //fch only uses two hash functions
55 fch->hashfuncs[i] = *hashptr;
56 ++i, ++hashptr;
57 }
58 }
59
60 cmph_uint32 mixh10h11h12(cmph_uint32 b, double p1, double p2, cmph_uint32 initial_index)
61 {
62 register cmph_uint32 int_p2 = (cmph_uint32)p2;
63 if (initial_index < p1) initial_index %= int_p2; /* h11 o h10 */
64 else { /* h12 o h10 */
65 initial_index %= b;
66 if(initial_index < p2) initial_index += int_p2;
67 }
68 return initial_index;
69 }
70
71
72 cmph_uint32 fch_calc_b(double c, cmph_uint32 m)
73 {
74 return (cmph_uint32)ceil((c*m)/(log((double)m)/log(2.0) + 1));
75 }
76
77 double fch_calc_p1(cmph_uint32 m)
78 {
79 return ceil(0.55*m);
80 }
81
82 double fch_calc_p2(cmph_uint32 b)
83 {
84 return ceil(0.3*b);
85 }
86
87 static fch_buckets_t * mapping(cmph_config_t *mph)
88 {
89 cmph_uint32 i = 0;
90 fch_buckets_t *buckets = NULL;
91 fch_config_data_t *fch = (fch_config_data_t *)mph->data;
92 if (fch->h1) hash_state_destroy(fch->h1);
93 fch->h1 = hash_state_new(fch->hashfuncs[0], fch->m);
94 fch->b = fch_calc_b(fch->c, fch->m);
95 fch->p1 = fch_calc_p1(fch->m);
96 fch->p2 = fch_calc_p2(fch->b);
97 //DEBUGP("b:%u p1:%f p2:%f\n", fch->b, fch->p1, fch->p2);
98 buckets = fch_buckets_new(fch->b);
99
100 mph->key_source->rewind(mph->key_source->data);
101 for(i = 0; i < fch->m; i++)
102 {
103 cmph_uint32 h1, keylen;
104 char *key = NULL;
105 mph->key_source->read(mph->key_source->data, &key, &keylen);
106 h1 = hash(fch->h1, key, keylen) % fch->m;
107 h1 = mixh10h11h12 (fch->b, fch->p1, fch->p2, h1);
108 fch_buckets_insert(buckets, h1, key, keylen);
109 key = NULL; // transger memory ownership
110
111 }
112 return buckets;
113 }
114
115
116 // returns the buckets indexes sorted by their sizes.
117 static cmph_uint32 * ordering(fch_buckets_t * buckets)
118 {
119 return fch_buckets_get_indexes_sorted_by_size(buckets);
120 }
121
122 /* Check whether function h2 causes collisions among the keys of each bucket */
123 static cmph_uint8 check_for_collisions_h2(fch_config_data_t *fch, fch_buckets_t * buckets, cmph_uint32 *sorted_indexes)
124 {
125 //cmph_uint32 max_size = fch_buckets_get_max_size(buckets);
126 cmph_uint8 * hashtable = (cmph_uint8 *)calloc((size_t)fch->m, sizeof(cmph_uint8));
127 cmph_uint32 nbuckets = fch_buckets_get_nbuckets(buckets);
128 cmph_uint32 i = 0, index = 0, j =0;
129 for (i = 0; i < nbuckets; i++)
130 {
131 cmph_uint32 nkeys = fch_buckets_get_size(buckets, sorted_indexes[i]);
132 memset(hashtable, 0, (size_t)fch->m);
133 //DEBUGP("bucket %u -- nkeys: %u\n", i, nkeys);
134 for (j = 0; j < nkeys; j++)
135 {
136 char * key = fch_buckets_get_key(buckets, sorted_indexes[i], j);
137 cmph_uint32 keylen = fch_buckets_get_keylength(buckets, sorted_indexes[i], j);
138 index = hash(fch->h2, key, keylen) % fch->m;
139 if(hashtable[index]) { // collision detected
140 free(hashtable);
141 return 1;
142 }
143 hashtable[index] = 1;
144 }
145 }
146 free(hashtable);
147 return 0;
148 }
149
150 static void permut(cmph_uint32 * vector, cmph_uint32 n)
151 {
152 cmph_uint32 i, j, b;
153 for (i = 0; i < n; i++) {
154 j = (cmph_uint32) rand() % n;
155 b = vector[i];
156 vector[i] = vector[j];
157 vector[j] = b;
158 }
159 }
160
161 static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph_uint32 *sorted_indexes)
162 {
163 cmph_uint32 * random_table = (cmph_uint32 *) calloc((size_t)fch->m, sizeof(cmph_uint32));
164 cmph_uint32 * map_table = (cmph_uint32 *) calloc((size_t)fch->m, sizeof(cmph_uint32));
165 cmph_uint32 iteration_to_generate_h2 = 0;
166 cmph_uint32 searching_iterations = 0;
167 cmph_uint8 restart = 0;
168 cmph_uint32 nbuckets = fch_buckets_get_nbuckets(buckets);
169 cmph_uint32 i, j, z, counter = 0, filled_count = 0;
170 if (fch->g) free (fch->g);
171 fch->g = (cmph_uint32 *) calloc((size_t)fch->b, sizeof(cmph_uint32));
172
173 //DEBUGP("max bucket size: %u\n", fch_buckets_get_max_size(buckets));
174
175 for(i = 0; i < fch->m; i++)
176 {
177 random_table[i] = i;
178 }
179 permut(random_table, fch->m);
180 for(i = 0; i < fch->m; i++)
181 {
182 map_table[random_table[i]] = i;
183 }
184 do {
185 if (fch->h2) hash_state_destroy(fch->h2);
186 fch->h2 = hash_state_new(fch->hashfuncs[1], fch->m);
187 restart = check_for_collisions_h2(fch, buckets, sorted_indexes);
188 filled_count = 0;
189 if (!restart)
190 {
191 searching_iterations++; iteration_to_generate_h2 = 0;
192 //DEBUGP("searching_iterations: %u\n", searching_iterations);
193 }
194 else {
195 iteration_to_generate_h2++;
196 //DEBUGP("iteration_to_generate_h2: %u\n", iteration_to_generate_h2);
197 }
198 for(i = 0; (i < nbuckets) && !restart; i++) {
199 cmph_uint32 bucketsize = fch_buckets_get_size(buckets, sorted_indexes[i]);
200 if (bucketsize == 0)
201 {
202 restart = 0; // false
203 break;
204 }
205 else restart = 1; // true
206 for(z = 0; (z < (fch->m - filled_count)) && restart; z++) {
207 char * key = fch_buckets_get_key(buckets, sorted_indexes[i], INDEX);
208 cmph_uint32 keylen = fch_buckets_get_keylength(buckets, sorted_indexes[i], INDEX);
209 cmph_uint32 h2 = hash(fch->h2, key, keylen) % fch->m;
210 counter = 0;
211 restart = 0; // false
212 fch->g[sorted_indexes[i]] = (fch->m + random_table[filled_count + z] - h2) % fch->m;
213 //DEBUGP("g[%u]: %u\n", sorted_indexes[i], fch->g[sorted_indexes[i]]);
214 j = INDEX;
215 do {
216 cmph_uint32 index = 0;
217 key = fch_buckets_get_key(buckets, sorted_indexes[i], j);
218 keylen = fch_buckets_get_keylength(buckets, sorted_indexes[i], j);
219 h2 = hash(fch->h2, key, keylen) % fch->m;
220 index = (h2 + fch->g[sorted_indexes[i]]) % fch->m;
221 //DEBUGP("key:%s keylen:%u index: %u h2:%u bucketsize:%u\n", key, keylen, index, h2, bucketsize);
222 if (map_table[index] >= filled_count) {
223 cmph_uint32 y = map_table[index];
224 cmph_uint32 ry = random_table[y];
225 random_table[y] = random_table[filled_count];
226 random_table[filled_count] = ry;
227 map_table[random_table[y]] = y;
228 map_table[random_table[filled_count]] = filled_count;
229 filled_count++;
230 counter ++;
231 }
232 else {
233 restart = 1; // true
234 filled_count = filled_count - counter;
235 counter = 0;
236 break;
237 }
238 j = (j + 1) % bucketsize;
239 } while(j % bucketsize != INDEX);
240 }
241 //getchar();
242 }
243 } while(restart && (searching_iterations < 10) && (iteration_to_generate_h2 < 1000));
244 free(map_table);
245 free(random_table);
246 return restart;
247 }
248
249
250
251 cmph_t *fch_new(cmph_config_t *mph, double c)
252 {
253 cmph_t *mphf = NULL;
254 fch_data_t *fchf = NULL;
255 cmph_uint32 iterations = 100;
256 cmph_uint8 restart_mapping = 0;
257 fch_buckets_t * buckets = NULL;
258 cmph_uint32 * sorted_indexes = NULL;
259 fch_config_data_t *fch = (fch_config_data_t *)mph->data;
260 fch->m = mph->key_source->nkeys;
261 //DEBUGP("m: %f\n", fch->m);
262 if (c <= 2) c = 2.6; // validating restrictions over parameter c.
263 fch->c = c;
264 //DEBUGP("c: %f\n", fch->c);
265 fch->h1 = NULL;
266 fch->h2 = NULL;
267 fch->g = NULL;
268 do
269 {
270 if (mph->verbosity)
271 {
272 fprintf(stderr, "Entering mapping step for mph creation of %u keys\n", fch->m);
273 }
274 if (buckets) fch_buckets_destroy(buckets);
275 buckets = mapping(mph);
276 if (mph->verbosity)
277 {
278 fprintf(stderr, "Starting ordering step\n");
279 }
280 if (sorted_indexes) free (sorted_indexes);
281 sorted_indexes = ordering(buckets);
282 if (mph->verbosity)
283 {
284 fprintf(stderr, "Starting searching step.\n");
285 }
286 restart_mapping = searching(fch, buckets, sorted_indexes);
287 iterations--;
288
289 } while(restart_mapping && iterations > 0);
290 if (buckets) fch_buckets_destroy(buckets);
291 if (sorted_indexes) free (sorted_indexes);
292 if (iterations == 0) return NULL;
293 mphf = (cmph_t *)malloc(sizeof(cmph_t));
294 mphf->algo = mph->algo;
295 fchf = (fch_data_t *)malloc(sizeof(fch_data_t));
296 fchf->g = fch->g;
297 fch->g = NULL; //transfer memory ownership
298 fchf->h1 = fch->h1;
299 fch->h1 = NULL; //transfer memory ownership
300 fchf->h2 = fch->h2;
301 fch->h2 = NULL; //transfer memory ownership
302 fchf->p2 = fch->p2;
303 fchf->p1 = fch->p1;
304 fchf->b = fch->b;
305 fchf->c = fch->c;
306 fchf->m = fch->m;
307 mphf->data = fchf;
308 mphf->size = fch->m;
309 //DEBUGP("Successfully generated minimal perfect hash\n");
310 if (mph->verbosity)
311 {
312 fprintf(stderr, "Successfully generated minimal perfect hash function\n");
313 }
314 return mphf;
315 }
316
317 int fch_dump(cmph_t *mphf, FILE *fd)
318 {
319 char *buf = NULL;
320 cmph_uint32 buflen;
321 register size_t nbytes;
322
323 fch_data_t *data = (fch_data_t *)mphf->data;
324
325 #ifdef DEBUG
326 cmph_uint32 i;
327 #endif
328 __cmph_dump(mphf, fd);
329
330 hash_state_dump(data->h1, &buf, &buflen);
331 //DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
332 nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
333 nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
334 free(buf);
335
336 hash_state_dump(data->h2, &buf, &buflen);
337 //DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
338 nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
339 nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
340 free(buf);
341
342 nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd);
343 nbytes = fwrite(&(data->c), sizeof(double), (size_t)1, fd);
344 nbytes = fwrite(&(data->b), sizeof(cmph_uint32), (size_t)1, fd);
345 nbytes = fwrite(&(data->p1), sizeof(double), (size_t)1, fd);
346 nbytes = fwrite(&(data->p2), sizeof(double), (size_t)1, fd);
347 nbytes = fwrite(data->g, sizeof(cmph_uint32)*(data->b), (size_t)1, fd);
348 if (nbytes == 0 && ferror(fd)) {
349 fprintf(stderr, "ERROR: %s\n", strerror(errno));
350 return 0;
351 }
352 #ifdef DEBUG
353 fprintf(stderr, "G: ");
354 for (i = 0; i < data->b; ++i) fprintf(stderr, "%u ", data->g[i]);
355 fprintf(stderr, "\n");
356 #endif
357 return 1;
358 }
359
360 void fch_load(FILE *f, cmph_t *mphf)
361 {
362 char *buf = NULL;
363 cmph_uint32 buflen;
364 register size_t nbytes;
365 fch_data_t *fch = (fch_data_t *)malloc(sizeof(fch_data_t));
366 #ifdef DEBUG
367 cmph_uint32 i;
368 #endif
369
370 //DEBUGP("Loading fch mphf\n");
371 mphf->data = fch;
372 //DEBUGP("Reading h1\n");
373 fch->h1 = NULL;
374 nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
375 //DEBUGP("Hash state of h1 has %u bytes\n", buflen);
376 buf = (char *)malloc((size_t)buflen);
377 nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
378 fch->h1 = hash_state_load(buf, buflen);
379 free(buf);
380
381 //DEBUGP("Loading fch mphf\n");
382 mphf->data = fch;
383 //DEBUGP("Reading h2\n");
384 fch->h2 = NULL;
385 nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
386 //DEBUGP("Hash state of h2 has %u bytes\n", buflen);
387 buf = (char *)malloc((size_t)buflen);
388 nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
389 fch->h2 = hash_state_load(buf, buflen);
390 free(buf);
391
392
393 //DEBUGP("Reading m and n\n");
394 nbytes = fread(&(fch->m), sizeof(cmph_uint32), (size_t)1, f);
395 nbytes = fread(&(fch->c), sizeof(double), (size_t)1, f);
396 nbytes = fread(&(fch->b), sizeof(cmph_uint32), (size_t)1, f);
397 nbytes = fread(&(fch->p1), sizeof(double), (size_t)1, f);
398 nbytes = fread(&(fch->p2), sizeof(double), (size_t)1, f);
399
400 fch->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*fch->b);
401 nbytes = fread(fch->g, fch->b*sizeof(cmph_uint32), (size_t)1, f);
402 if (nbytes == 0 && ferror(f)) {
403 fprintf(stderr, "ERROR: %s\n", strerror(errno));
404 return;
405 }
406
407 #ifdef DEBUG
408 fprintf(stderr, "G: ");
409 for (i = 0; i < fch->b; ++i) fprintf(stderr, "%u ", fch->g[i]);
410 fprintf(stderr, "\n");
411 #endif
412 return;
413 }
414
415 cmph_uint32 fch_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
416 {
417 fch_data_t *fch = mphf->data;
418 cmph_uint32 h1 = hash(fch->h1, key, keylen) % fch->m;
419 cmph_uint32 h2 = hash(fch->h2, key, keylen) % fch->m;
420 h1 = mixh10h11h12 (fch->b, fch->p1, fch->p2, h1);
421 //DEBUGP("key: %s h1: %u h2: %u g[h1]: %u\n", key, h1, h2, fch->g[h1]);
422 return (h2 + fch->g[h1]) % fch->m;
423 }
424 void fch_destroy(cmph_t *mphf)
425 {
426 fch_data_t *data = (fch_data_t *)mphf->data;
427 free(data->g);
428 hash_state_destroy(data->h1);
429 hash_state_destroy(data->h2);
430 free(data);
431 free(mphf);
432 }
433
434 /** \fn void fch_pack(cmph_t *mphf, void *packed_mphf);
435 * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
436 * \param mphf pointer to the resulting mphf
437 * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
438 */
439 void fch_pack(cmph_t *mphf, void *packed_mphf)
440 {
441 fch_data_t *data = (fch_data_t *)mphf->data;
442 cmph_uint8 * ptr = packed_mphf;
443
444 // packing h1 type
445 CMPH_HASH h1_type = hash_get_type(data->h1);
446 CMPH_HASH h2_type;
447 *((cmph_uint32 *) ptr) = h1_type;
448 ptr += sizeof(cmph_uint32);
449
450 // packing h1
451 hash_state_pack(data->h1, ptr);
452 ptr += hash_state_packed_size(h1_type);
453
454 // packing h2 type
455 h2_type = hash_get_type(data->h2);
456 *((cmph_uint32 *) ptr) = h2_type;
457 ptr += sizeof(cmph_uint32);
458
459 // packing h2
460 hash_state_pack(data->h2, ptr);
461 ptr += hash_state_packed_size(h2_type);
462
463 // packing m
464 *((cmph_uint32 *) ptr) = data->m;
465 ptr += sizeof(data->m);
466
467 // packing b
468 *((cmph_uint32 *) ptr) = data->b;
469 ptr += sizeof(data->b);
470
471 // packing p1
472 *((cmph_uint64 *)ptr) = (cmph_uint64)data->p1;
473 ptr += sizeof(data->p1);
474
475 // packing p2
476 *((cmph_uint64 *)ptr) = (cmph_uint64)data->p2;
477 ptr += sizeof(data->p2);
478
479 // packing g
480 memcpy(ptr, data->g, sizeof(cmph_uint32)*(data->b));
481 }
482
483 /** \fn cmph_uint32 fch_packed_size(cmph_t *mphf);
484 * \brief Return the amount of space needed to pack mphf.
485 * \param mphf pointer to a mphf
486 * \return the size of the packed function or zero for failures
487 */
488 cmph_uint32 fch_packed_size(cmph_t *mphf)
489 {
490 fch_data_t *data = (fch_data_t *)mphf->data;
491 CMPH_HASH h1_type = hash_get_type(data->h1);
492 CMPH_HASH h2_type = hash_get_type(data->h2);
493
494 return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) +
495 4*sizeof(cmph_uint32) + 2*sizeof(double) + sizeof(cmph_uint32)*(data->b));
496 }
497
498
499 /** cmph_uint32 fch_search(void *packed_mphf, const char *key, cmph_uint32 keylen);
500 * \brief Use the packed mphf to do a search.
501 * \param packed_mphf pointer to the packed mphf
502 * \param key key to be hashed
503 * \param keylen key legth in bytes
504 * \return The mphf value
505 */
506 cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
507 {
508 register cmph_uint8 *h1_ptr = packed_mphf;
509 register CMPH_HASH h1_type = *((cmph_uint32 *)h1_ptr);
510 register cmph_uint8 *h2_ptr;
511 register CMPH_HASH h2_type;
512 register cmph_uint32 *g_ptr;
513 register cmph_uint32 m, b, h1, h2;
514 register double p1, p2;
515
516 h1_ptr += 4;
517
518 h2_ptr = h1_ptr + hash_state_packed_size(h1_type);
519 h2_type = *((cmph_uint32 *)h2_ptr);
520 h2_ptr += 4;
521
522 g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type));
523
524 m = *g_ptr++;
525
526 b = *g_ptr++;
527
528 p1 = (double)(*((cmph_uint64 *)g_ptr));
529 g_ptr += 2;
530
531 p2 = (double)(*((cmph_uint64 *)g_ptr));
532 g_ptr += 2;
533
534 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m;
535 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % m;
536 h1 = mixh10h11h12 (b, p1, p2, h1);
537 return (h2 + g_ptr[h1]) % m;
538 }
539