1 #include<stdio.h>
2 #include<stdlib.h>
3 #include<string.h>
4 #include<math.h>
5 #include<time.h>
6 #include<assert.h>
7 #include<limits.h>
8 #include<errno.h>
9
10 #include "cmph_structs.h"
11 #include "chd_structs_ph.h"
12 #include "chd_ph.h"
13 #include"miller_rabin.h"
14 #include"bitbool.h"
15
16
17 //#define DEBUG
18 #include "debug.h"
19
20 // NO_ELEMENT is equivalent to null pointer
21 #ifndef NO_ELEMENT
22 #define NO_ELEMENT UINT_MAX
23 #endif
24
25 // struct used to represent items at mapping, ordering and searching phases
26 struct _chd_ph_item_t
27 {
28 cmph_uint32 f;
29 cmph_uint32 h;
30 };
31 typedef struct _chd_ph_item_t chd_ph_item_t;
32
33 // struct to represent the items at mapping phase only.
34 struct _chd_ph_map_item_t
35 {
36 cmph_uint32 f;
37 cmph_uint32 h;
38 cmph_uint32 bucket_num;
39 };
40 typedef struct _chd_ph_map_item_t chd_ph_map_item_t;
41
42 // struct to represent a bucket
43 struct _chd_ph_bucket_t
44 {
45 cmph_uint32 items_list; // offset
46 union
47 {
48 cmph_uint32 size;
49 cmph_uint32 bucket_id;
50 };
51 };
52
53 typedef struct _chd_ph_bucket_t chd_ph_bucket_t;
54
55 struct _chd_ph_sorted_list_t
56 {
57 cmph_uint32 buckets_list;
58 cmph_uint32 size;
59 };
60
61 typedef struct _chd_ph_sorted_list_t chd_ph_sorted_list_t;
62
63
64 static inline chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets);
65 static inline void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets);
66 static inline void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets);
67
68 chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets)
69 {
70 chd_ph_bucket_t * buckets = (chd_ph_bucket_t *) calloc(nbuckets, sizeof(chd_ph_bucket_t));
71 return buckets;
72 }
73
74 void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets)
75 {
76 register cmph_uint32 i = 0;
77 assert(buckets);
78 for(i = 0; i < nbuckets; i++)
79 buckets[i].size = 0;
80 }
81 static cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets,chd_ph_map_item_t * map_items, chd_ph_item_t * items,
82 cmph_uint32 nbuckets,cmph_uint32 item_idx)
83 {
84 register cmph_uint32 i = 0;
85 register chd_ph_item_t * tmp_item;
86 register chd_ph_map_item_t * tmp_map_item = map_items + item_idx;
87 register chd_ph_bucket_t * bucket = buckets + tmp_map_item->bucket_num;
88 tmp_item = items + bucket->items_list;
89
90 for(i = 0; i < bucket->size; i++)
91 {
92 if(tmp_item->f == tmp_map_item->f && tmp_item->h == tmp_map_item->h)
93 {
94 DEBUGP("Item not added\n");
95 return 0;
96 };
97 tmp_item++;
98 };
99 tmp_item->f = tmp_map_item->f;
100 tmp_item->h = tmp_map_item->h;
101 bucket->size++;
102 return 1;
103 };
104 void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets)
105 {
106 free(buckets);
107 }
108
109 static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items,
110 cmph_uint32 *max_bucket_size);
111
112 static chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets,chd_ph_item_t ** items,
113 cmph_uint32 nbuckets,cmph_uint32 nitems, cmph_uint32 max_bucket_size);
114
115 static cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items ,
116 cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table);
117
118 static inline double chd_ph_space_lower_bound(cmph_uint32 _n, cmph_uint32 _r)
119 {
120 double r = _r, n = _n;
121 return (1 + (r/n - 1.0 + 1.0/(2.0*n))*log(1 - n/r))/log(2);
122 };
123
124 /* computes the entropy of non empty buckets.*/
125 static inline double chd_ph_get_entropy(cmph_uint32 * disp_table, cmph_uint32 n, cmph_uint32 max_probes)
126 {
127 register cmph_uint32 * probe_counts = (cmph_uint32 *) calloc(max_probes, sizeof(cmph_uint32));
128 register cmph_uint32 i;
129 register double entropy = 0;
130
131 for(i = 0; i < n; i++)
132 {
133 probe_counts[disp_table[i]]++;
134 };
135
136 for(i = 0; i < max_probes; i++)
137 {
138 if(probe_counts[i] > 0)
139 entropy -= probe_counts[i]*log((double)probe_counts[i]/(double)n)/log(2);
140 };
141 free(probe_counts);
142 return entropy;
143 };
144
145 chd_ph_config_data_t *chd_ph_config_new(void)
146 {
147 chd_ph_config_data_t *chd_ph;
148 chd_ph = (chd_ph_config_data_t *)malloc(sizeof(chd_ph_config_data_t));
149 assert(chd_ph);
150 memset(chd_ph, 0, sizeof(chd_ph_config_data_t));
151
152 chd_ph->hashfunc = CMPH_HASH_JENKINS;
153 chd_ph->cs = NULL;
154 chd_ph->nbuckets = 0;
155 chd_ph->n = 0;
156 chd_ph->hl = NULL;
157
158 chd_ph->m = 0;
159 chd_ph->use_h = 1;
160 chd_ph->keys_per_bin = 1;
161 chd_ph->keys_per_bucket = 4;
162 chd_ph->occup_table = 0;
163
164 return chd_ph;
165 }
166
167 void chd_ph_config_destroy(cmph_config_t *mph)
168 {
169 chd_ph_config_data_t *data = (chd_ph_config_data_t *) mph->data;
170 DEBUGP("Destroying algorithm dependent data\n");
171 if(data->occup_table)
172 {
173 free(data->occup_table);
174 data->occup_table = NULL;
175 }
176 free(data);
177 }
178
179
180 void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
181 {
182 chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
183 CMPH_HASH *hashptr = hashfuncs;
184 cmph_uint32 i = 0;
185 while(*hashptr != CMPH_HASH_COUNT)
186 {
187 if (i >= 1) break; //chd_ph only uses one linear hash function
188 chd_ph->hashfunc = *hashptr;
189 ++i, ++hashptr;
190 }
191 }
192
193
194 void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket)
195 {
196 chd_ph_config_data_t *chd_ph;
197 assert(mph);
198 chd_ph = (chd_ph_config_data_t *)mph->data;
199 if(keys_per_bucket < 1 || keys_per_bucket >= 15)
200 {
201 keys_per_bucket = 4;
202 }
203 chd_ph->keys_per_bucket = keys_per_bucket;
204 }
205
206
207 void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin)
208 {
209 chd_ph_config_data_t *chd_ph;
210 assert(mph);
211 chd_ph = (chd_ph_config_data_t *)mph->data;
212 if(keys_per_bin <= 1 || keys_per_bin >= 128)
213 {
214 keys_per_bin = 1;
215 }
216 chd_ph->keys_per_bin = keys_per_bin;
217 }
218
219 cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, cmph_uint32 *max_bucket_size)
220 {
221 register cmph_uint32 i = 0, g = 0;
222 cmph_uint32 hl[3];
223 chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
224 char * key = NULL;
225 cmph_uint32 keylen = 0;
226 chd_ph_map_item_t * map_item;
227 chd_ph_map_item_t * map_items = malloc(chd_ph->m*sizeof(chd_ph_map_item_t));
228 register cmph_uint32 mapping_iterations = 1000;
229 *max_bucket_size = 0;
230 while(1)
231 {
232 mapping_iterations--;
233 if (chd_ph->hl) hash_state_destroy(chd_ph->hl);
234 chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m);
235
236 chd_ph_bucket_clean(buckets, chd_ph->nbuckets);
237
238 mph->key_source->rewind(mph->key_source->data);
239
240 for(i = 0; i < chd_ph->m; i++)
241 {
242 mph->key_source->read(mph->key_source->data, &key, &keylen);
243 hash_vector(chd_ph->hl, key, keylen, hl);
244
245 map_item = (map_items + i);
246
247 g = hl[0] % chd_ph->nbuckets;
248 map_item->f = hl[1] % chd_ph->n;
249 map_item->h = hl[2] % (chd_ph->n - 1) + 1;
250 map_item->bucket_num=g;
251 mph->key_source->dispose(mph->key_source->data, key, keylen);
252 // if(buckets[g].size == (chd_ph->keys_per_bucket << 2))
253 // {
254 // DEBUGP("BUCKET = %u -- SIZE = %u -- MAXIMUM SIZE = %u\n", g, buckets[g].size, (chd_ph->keys_per_bucket << 2));
255 // goto error;
256 // }
257 buckets[g].size++;
258 if(buckets[g].size > *max_bucket_size)
259 {
260 *max_bucket_size = buckets[g].size;
261 }
262 }
263 buckets[0].items_list = 0;
264 for(i = 1; i < chd_ph->nbuckets; i++)
265 {
266 buckets[i].items_list = buckets[i-1].items_list + buckets[i - 1].size;
267 buckets[i - 1].size = 0;
268 };
269 buckets[i - 1].size = 0;
270 for(i = 0; i < chd_ph->m; i++)
271 {
272 map_item = (map_items + i);
273 if(!chd_ph_bucket_insert(buckets, map_items, items, chd_ph->nbuckets, i))
274 break;
275 }
276 if(i == chd_ph->m)
277 {
278 free(map_items);
279 return 1; // SUCCESS
280 }
281
282 if(mapping_iterations == 0)
283 {
284 goto error;
285 }
286 }
287 error:
288 free(map_items);
289 hash_state_destroy(chd_ph->hl);
290 chd_ph->hl = NULL;
291 return 0; // FAILURE
292 }
293
294 chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_t ** _items,
295 cmph_uint32 nbuckets, cmph_uint32 nitems, cmph_uint32 max_bucket_size)
296 {
297 chd_ph_sorted_list_t * sorted_lists = (chd_ph_sorted_list_t *) calloc(max_bucket_size + 1, sizeof(chd_ph_sorted_list_t));
298
299 chd_ph_bucket_t * input_buckets = (*_buckets);
300 chd_ph_bucket_t * output_buckets;
301 chd_ph_item_t * input_items = (*_items);
302 chd_ph_item_t * output_items;
303 register cmph_uint32 i, j, bucket_size, position, position2;
304 // cmph_uint32 non_empty_buckets;
305 DEBUGP("MAX BUCKET SIZE = %u\n", max_bucket_size);
306 // Determine size of each list of buckets
307 for(i = 0; i < nbuckets; i++)
308 {
309 bucket_size = input_buckets[i].size;
310 if(bucket_size == 0)
311 continue;
312 sorted_lists[bucket_size].size++;
313 };
314 sorted_lists[1].buckets_list = 0;
315 // Determine final position of list of buckets into the contiguous array that will store all the buckets
316 for(i = 2; i <= max_bucket_size; i++)
317 {
318 sorted_lists[i].buckets_list = sorted_lists[i-1].buckets_list + sorted_lists[i-1].size;
319 sorted_lists[i-1].size = 0;
320 };
321 sorted_lists[i-1].size = 0;
322 // Store the buckets in a new array which is sorted by bucket sizes
323 output_buckets = calloc(nbuckets, sizeof(chd_ph_bucket_t)); // everything is initialized with zero
324 // non_empty_buckets = nbuckets;
325
326 for(i = 0; i < nbuckets; i++)
327 {
328 bucket_size = input_buckets[i].size;
329 if(bucket_size == 0)
330 {
331 // non_empty_buckets--;
332 continue;
333 };
334 position = sorted_lists[bucket_size].buckets_list + sorted_lists[bucket_size].size;
335 output_buckets[position].bucket_id = i;
336 output_buckets[position].items_list = input_buckets[i].items_list;
337 sorted_lists[bucket_size].size++;
338 };
339 /* for(i = non_empty_buckets; i < nbuckets; i++)
340 output_buckets[i].size=0;*/
341 // Return the buckets sorted in new order and free the old buckets sorted in old order
342 free(input_buckets);
343 (*_buckets) = output_buckets;
344
345
346 // Store the items according to the new order of buckets.
347 output_items = (chd_ph_item_t*)calloc(nitems, sizeof(chd_ph_item_t));
348 position = 0;
349 i = 0;
350 for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++)
351 {
352 for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size + sorted_lists[bucket_size].buckets_list; i++)
353 {
354 position2 = output_buckets[i].items_list;
355 output_buckets[i].items_list = position;
356 for(j = 0; j < bucket_size; j++)
357 {
358 output_items[position].f = input_items[position2].f;
359 output_items[position].h = input_items[position2].h;
360 position++;
361 position2++;
362 };
363 };
364 };
365 //Return the items sorted in new order and free the old items sorted in old order
366 free(input_items);
367 (*_items) = output_items;
368 return sorted_lists;
369 };
370
371 static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets,
372 chd_ph_item_t *items, cmph_uint32 probe0_num, cmph_uint32 probe1_num,
373 cmph_uint32 bucket_num, cmph_uint32 size)
374 {
375 register cmph_uint32 i;
376 register chd_ph_item_t * item;
377 register cmph_uint32 position;
378
379 item = items + buckets[bucket_num].items_list;
380 // try place bucket with probe_num
381 if(chd_ph->keys_per_bin > 1)
382 {
383 for(i = 0; i < size; i++) // placement
384 {
385 position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
386 if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin)
387 {
388 break;
389 }
390 (chd_ph->occup_table[position])++;
391 item++;
392 };
393 } else
394 {
395 for(i = 0; i < size; i++) // placement
396 {
397 position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
398 if(GETBIT32(((cmph_uint32 *)chd_ph->occup_table), position))
399 {
400 break;
401 }
402 SETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
403 item++;
404 };
405 };
406 if(i != size) // Undo the placement
407 {
408 item = items + buckets[bucket_num].items_list;
409 if(chd_ph->keys_per_bin > 1)
410 {
411 while(1)
412 {
413 if(i == 0)
414 {
415 break;
416 }
417 position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
418 (chd_ph->occup_table[position])--;
419 item++;
420 i--;
421 };
422 } else
423 {
424 while(1)
425 {
426 if(i == 0)
427 {
428 break;
429 }
430 position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
431 UNSETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
432
433 // ([position/32]^=(1<<(position%32));
434 item++;
435 i--;
436 };
437 };
438 return 0;
439 }
440 return 1;
441 };
442
443 static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, cmph_uint32 max_probes,
444 cmph_uint32 * disp_table, cmph_uint32 bucket_num, cmph_uint32 size)
445
446 {
447 register cmph_uint32 probe0_num, probe1_num, probe_num;
448 probe0_num = 0;
449 probe1_num = 0;
450 probe_num = 0;
451
452 while(1)
453 {
454 if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, bucket_num,size))
455 {
456 disp_table[buckets[bucket_num].bucket_id] = probe0_num + probe1_num * chd_ph->n;
457 return 1;
458 }
459 probe0_num++;
460 if(probe0_num >= chd_ph->n)
461 {
462 probe0_num -= chd_ph->n;
463 probe1_num++;
464 };
465 probe_num++;
466 if(probe_num >= max_probes || probe1_num >= chd_ph->n)
467 {
468 return 0;
469 };
470 };
471 return 0;
472 };
473
474 static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t * buckets, chd_ph_item_t *items,
475 cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
476 cmph_uint32 * disp_table)
477 {
478 register cmph_uint32 i = 0;
479 register cmph_uint32 curr_bucket = 0;
480
481 for(i = max_bucket_size; i > 0; i--)
482 {
483 curr_bucket = sorted_lists[i].buckets_list;
484 while(curr_bucket < sorted_lists[i].size + sorted_lists[i].buckets_list)
485 {
486 if(!place_bucket(chd_ph, buckets, items, max_probes, disp_table, curr_bucket, i))
487 {
488 return 0;
489 }
490 curr_bucket++;
491 };
492 };
493 return 1;
494 };
495
496 static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items,
497 cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
498 cmph_uint32 * disp_table)
499 {
500 register cmph_uint32 i,j, non_placed_bucket;
501 register cmph_uint32 curr_bucket;
502 register cmph_uint32 probe_num, probe0_num, probe1_num;
503 cmph_uint32 sorted_list_size;
504 #ifdef DEBUG
505 cmph_uint32 items_list;
506 cmph_uint32 bucket_id;
507 #endif
508 DEBUGP("USING HEURISTIC TO PLACE BUCKETS\n");
509 for(i = max_bucket_size; i > 0; i--)
510 {
511 probe_num = 0;
512 probe0_num = 0;
513 probe1_num = 0;
514 sorted_list_size = sorted_lists[i].size;
515 while(sorted_lists[i].size != 0)
516 {
517 curr_bucket = sorted_lists[i].buckets_list;
518 for(j = 0, non_placed_bucket = 0; j < sorted_lists[i].size; j++)
519 {
520 // if bucket is successfully placed remove it from list
521 if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, curr_bucket, i))
522 {
523 disp_table[buckets[curr_bucket].bucket_id] = probe0_num + probe1_num * chd_ph->n;
524 // DEBUGP("BUCKET %u PLACED --- DISPLACEMENT = %u\n", curr_bucket, disp_table[curr_bucket]);
525 }
526 else
527 {
528 // DEBUGP("BUCKET %u NOT PLACED\n", curr_bucket);
529 #ifdef DEBUG
530 items_list = buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list;
531 bucket_id = buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id;
532 #endif
533 buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list = buckets[curr_bucket].items_list;
534 buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id = buckets[curr_bucket].bucket_id;
535 #ifdef DEBUG
536 buckets[curr_bucket].items_list=items_list;
537 buckets[curr_bucket].bucket_id=bucket_id;
538 #endif
539 non_placed_bucket++;
540 }
541 curr_bucket++;
542 };
543 sorted_lists[i].size = non_placed_bucket;
544 probe0_num++;
545 if(probe0_num >= chd_ph->n)
546 {
547 probe0_num -= chd_ph->n;
548 probe1_num++;
549 };
550 probe_num++;
551 if(probe_num >= max_probes || probe1_num >= chd_ph->n)
552 {
553 sorted_lists[i].size = sorted_list_size;
554 return 0;
555 };
556 };
557 sorted_lists[i].size = sorted_list_size;
558 };
559 return 1;
560 };
561
562 cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items ,
563 cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
564 cmph_uint32 * disp_table)
565 {
566 if(chd_ph->use_h)
567 {
568 return place_buckets2(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
569 }
570 else
571 {
572 return place_buckets1(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
573 }
574
575 }
576
577 static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items,
578 cmph_uint32 * disp_table, chd_ph_sorted_list_t * sorted_lists,cmph_uint32 max_bucket_size)
579 {
580 register cmph_uint32 bucket_size, i, j;
581 register cmph_uint32 position, probe0_num, probe1_num;
582 G_GNUC_UNUSED register cmph_uint32 m = 0;
583 register chd_ph_item_t * item;
584 if(chd_ph->keys_per_bin > 1)
585 memset(chd_ph->occup_table, 0, chd_ph->n);
586 else
587 memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32));
588
589 for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++)
590 for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size +
591 sorted_lists[bucket_size].buckets_list; i++)
592 {
593 j = bucket_size;
594 item = items + buckets[i].items_list;
595 probe0_num = disp_table[buckets[i].bucket_id] % chd_ph->n;
596 probe1_num = disp_table[buckets[i].bucket_id] / chd_ph->n;
597 for(; j > 0; j--)
598 {
599 #ifdef DEBUG
600 m++;
601 #endif
602 position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
603 if(chd_ph->keys_per_bin > 1)
604 {
605 if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin)
606 {
607 return 0;
608 }
609 (chd_ph->occup_table[position])++;
610 }
611 else
612 {
613 if(GETBIT32(((cmph_uint32*)chd_ph->occup_table), position))
614 {
615 return 0;
616 }
617 SETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
618 };
619 item++;
620 };
621 };
622 DEBUGP("We were able to place m = %u keys\n", m);
623 return 1;
624 };
625
626
627 cmph_t *chd_ph_new(cmph_config_t *mph, double c)
628 {
629 cmph_t *mphf = NULL;
630 chd_ph_data_t *chd_phf = NULL;
631 chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
632
633 register double load_factor = c;
634 register cmph_uint8 searching_success = 0;
635 register cmph_uint32 max_probes = 1 << 20; // default value for max_probes
636 register cmph_uint32 iterations = 100;
637 chd_ph_bucket_t * buckets = NULL;
638 chd_ph_item_t * items = NULL;
639 register cmph_uint8 failure = 0;
640 cmph_uint32 max_bucket_size = 0;
641 chd_ph_sorted_list_t * sorted_lists = NULL;
642 cmph_uint32 * disp_table = NULL;
643 register double space_lower_bound = 0;
644 #ifdef CMPH_TIMING
645 double construction_time_begin = 0.0;
646 double construction_time = 0.0;
647 ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
648 #endif
649
650
651 chd_ph->m = mph->key_source->nkeys;
652 DEBUGP("m = %u\n", chd_ph->m);
653
654 chd_ph->nbuckets = (cmph_uint32)(chd_ph->m/chd_ph->keys_per_bucket) + 1;
655 DEBUGP("nbuckets = %u\n", chd_ph->nbuckets);
656
657 if(load_factor < 0.5 )
658 {
659 load_factor = 0.5;
660 }
661
662 if(load_factor >= 0.99)
663 {
664 load_factor = 0.99;
665 }
666
667 DEBUGP("load_factor = %.3f\n", load_factor);
668
669 chd_ph->n = (cmph_uint32)(chd_ph->m/(chd_ph->keys_per_bin * load_factor)) + 1;
670
671 //Round the number of bins to the prime immediately above
672 if(chd_ph->n % 2 == 0) chd_ph->n++;
673 for(;;)
674 {
675 if(check_primality(chd_ph->n) == 1)
676 break;
677 chd_ph->n += 2; // just odd numbers can be primes for n > 2
678
679 };
680
681 DEBUGP("n = %u \n", chd_ph->n);
682 if(chd_ph->keys_per_bin == 1)
683 {
684 space_lower_bound = chd_ph_space_lower_bound(chd_ph->m, chd_ph->n);
685 }
686
687 if(mph->verbosity)
688 {
689 fprintf(stderr, "space lower bound is %.3f bits per key\n", space_lower_bound);
690 }
691
692 // We allocate the working tables
693 buckets = chd_ph_bucket_new(chd_ph->nbuckets);
694 items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t));
695
696 max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes);
697
698 if(chd_ph->keys_per_bin == 1)
699 chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32));
700 else
701 chd_ph->occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8));
702
703 disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32));
704 //
705 // init_genrand(time(0));
706
707 while(1)
708 {
709 iterations --;
710 if (mph->verbosity)
711 {
712 fprintf(stderr, "Starting mapping step for mph creation of %u keys with %u bins\n", chd_ph->m, chd_ph->n);
713 }
714
715 if(!chd_ph_mapping(mph, buckets, items, &max_bucket_size))
716 {
717 if (mph->verbosity)
718 {
719 fprintf(stderr, "Failure in mapping step\n");
720 }
721 failure = 1;
722 goto cleanup;
723 }
724
725 if (mph->verbosity)
726 {
727 fprintf(stderr, "Starting ordering step\n");
728 }
729 if(sorted_lists)
730 {
731 free(sorted_lists);
732 }
733
734 sorted_lists = chd_ph_ordering(&buckets, &items, chd_ph->nbuckets, chd_ph->m, max_bucket_size);
735
736 if (mph->verbosity)
737 {
738 fprintf(stderr, "Starting searching step\n");
739 }
740
741 searching_success = chd_ph_searching(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
742 if(searching_success) break;
743
744 // reset occup_table
745 if(chd_ph->keys_per_bin > 1)
746 memset(chd_ph->occup_table, 0, chd_ph->n);
747 else
748 memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32));
749 if(iterations == 0)
750 {
751 // Cleanup memory
752 if (mph->verbosity)
753 {
754 fprintf(stderr, "Failure because the max trials was exceeded\n");
755 }
756 failure = 1;
757 goto cleanup;
758 };
759 }
760
761 #ifdef DEBUG
762 {
763 if(!chd_ph_check_bin_hashing(chd_ph, buckets, items, disp_table,sorted_lists,max_bucket_size))
764 {
765
766 DEBUGP("Error for bin packing generation");
767 failure = 1;
768 goto cleanup;
769 }
770 }
771 #endif
772
773 if (mph->verbosity)
774 {
775 fprintf(stderr, "Starting compressing step\n");
776 }
777
778 if(chd_ph->cs)
779 {
780 free(chd_ph->cs);
781 }
782 chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
783 compressed_seq_init(chd_ph->cs);
784 compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets);
785
786 #ifdef CMPH_TIMING
787 ELAPSED_TIME_IN_SECONDS(&construction_time);
788 register double entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes);
789 DEBUGP("Entropy = %.4f\n", entropy/chd_ph->m);
790 #endif
791
792 cleanup:
793 chd_ph_bucket_destroy(buckets);
794 free(items);
795 free(sorted_lists);
796 free(disp_table);
797 if(failure)
798 {
799 if(chd_ph->hl)
800 {
801 hash_state_destroy(chd_ph->hl);
802 }
803 chd_ph->hl = NULL;
804 return NULL;
805 }
806
807 mphf = (cmph_t *)malloc(sizeof(cmph_t));
808 mphf->algo = mph->algo;
809 chd_phf = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
810
811 chd_phf->cs = chd_ph->cs;
812 chd_ph->cs = NULL; //transfer memory ownership
813 chd_phf->hl = chd_ph->hl;
814 chd_ph->hl = NULL; //transfer memory ownership
815 chd_phf->n = chd_ph->n;
816 chd_phf->nbuckets = chd_ph->nbuckets;
817
818 mphf->data = chd_phf;
819 mphf->size = chd_ph->n;
820
821 DEBUGP("Successfully generated minimal perfect hash\n");
822 if (mph->verbosity)
823 {
824 fprintf(stderr, "Successfully generated minimal perfect hash function\n");
825 }
826
827 #ifdef CMPH_TIMING
828 register cmph_uint32 space_usage = chd_ph_packed_size(mphf)*8;
829 construction_time = construction_time - construction_time_begin;
830 fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\t%.4f\t%.4f\n", chd_ph->m, load_factor, chd_ph->keys_per_bucket, construction_time, space_usage/(double)chd_ph->m, space_lower_bound, entropy/chd_ph->m);
831 #endif
832
833 return mphf;
834 }
835
836
837
838 void chd_ph_load(FILE *fd, cmph_t *mphf)
839 {
840 char *buf = NULL;
841 cmph_uint32 buflen;
842 register size_t nbytes;
843 chd_ph_data_t *chd_ph = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
844
845 DEBUGP("Loading chd_ph mphf\n");
846 mphf->data = chd_ph;
847
848 nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
849 DEBUGP("Hash state has %u bytes\n", buflen);
850 buf = (char *)malloc((size_t)buflen);
851 nbytes = fread(buf, (size_t)buflen, (size_t)1, fd);
852 chd_ph->hl = hash_state_load(buf, buflen);
853 free(buf);
854
855 nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
856 DEBUGP("Compressed sequence structure has %u bytes\n", buflen);
857 buf = (char *)malloc((size_t)buflen);
858 nbytes = fread(buf, (size_t)buflen, (size_t)1, fd);
859 chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
860 compressed_seq_load(chd_ph->cs, buf, buflen);
861 free(buf);
862
863 // loading n and nbuckets
864 DEBUGP("Reading n and nbuckets\n");
865 nbytes = fread(&(chd_ph->n), sizeof(cmph_uint32), (size_t)1, fd);
866 nbytes = fread(&(chd_ph->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);
867 if (nbytes == 0 && ferror(fd)) {
868 fprintf(stderr, "ERROR: %s\n", strerror(errno));
869 }
870
871 }
872
873 int chd_ph_dump(cmph_t *mphf, FILE *fd)
874 {
875 char *buf = NULL;
876 cmph_uint32 buflen;
877 register size_t nbytes;
878 chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
879
880 __cmph_dump(mphf, fd);
881
882 hash_state_dump(data->hl, &buf, &buflen);
883 DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
884 nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
885 nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
886 free(buf);
887
888 compressed_seq_dump(data->cs, &buf, &buflen);
889 DEBUGP("Dumping compressed sequence structure with %u bytes to disk\n", buflen);
890 nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
891 nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
892 free(buf);
893
894 // dumping n and nbuckets
895 nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd);
896 nbytes = fwrite(&(data->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);
897 if (nbytes == 0 && ferror(fd)) {
898 fprintf(stderr, "ERROR: %s\n", strerror(errno));
899 return 0;
900 }
901 return 1;
902 }
903
904 void chd_ph_destroy(cmph_t *mphf)
905 {
906 chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
907 compressed_seq_destroy(data->cs);
908 free(data->cs);
909 hash_state_destroy(data->hl);
910 free(data);
911 free(mphf);
912
913 }
914
915 cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
916 {
917 register chd_ph_data_t * chd_ph = mphf->data;
918 cmph_uint32 hl[3];
919 register cmph_uint32 disp,position;
920 register cmph_uint32 probe0_num,probe1_num;
921 register cmph_uint32 f,g,h;
922 hash_vector(chd_ph->hl, key, keylen, hl);
923 g = hl[0] % chd_ph->nbuckets;
924 f = hl[1] % chd_ph->n;
925 h = hl[2] % (chd_ph->n-1) + 1;
926
927 disp = compressed_seq_query(chd_ph->cs, g);
928 probe0_num = disp % chd_ph->n;
929 probe1_num = disp/chd_ph->n;
930 position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % chd_ph->n);
931 return position;
932 }
933
934 void chd_ph_pack(cmph_t *mphf, void *packed_mphf)
935 {
936 chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
937 cmph_uint8 * ptr = packed_mphf;
938
939 // packing hl type
940 CMPH_HASH hl_type = hash_get_type(data->hl);
941 *((cmph_uint32 *) ptr) = hl_type;
942 ptr += sizeof(cmph_uint32);
943
944 // packing hl
945 hash_state_pack(data->hl, ptr);
946 ptr += hash_state_packed_size(hl_type);
947
948 // packing n
949 *((cmph_uint32 *) ptr) = data->n;
950 ptr += sizeof(data->n);
951
952 // packing nbuckets
953 *((cmph_uint32 *) ptr) = data->nbuckets;
954 ptr += sizeof(data->nbuckets);
955
956 // packing cs
957 compressed_seq_pack(data->cs, ptr);
958 //ptr += compressed_seq_packed_size(data->cs);
959
960 }
961
962 cmph_uint32 chd_ph_packed_size(cmph_t *mphf)
963 {
964 register chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
965 register CMPH_HASH hl_type = hash_get_type(data->hl);
966 register cmph_uint32 hash_state_pack_size = hash_state_packed_size(hl_type);
967 register cmph_uint32 cs_pack_size = compressed_seq_packed_size(data->cs);
968
969 return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_pack_size + cs_pack_size + 3*sizeof(cmph_uint32));
970
971 }
972
973 cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
974 {
975 register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf;
976 register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4;
977
978 register cmph_uint32 * ptr = (cmph_uint32 *)(hl_ptr + hash_state_packed_size(hl_type));
979 register cmph_uint32 n = *ptr++;
980 register cmph_uint32 nbuckets = *ptr++;
981 cmph_uint32 hl[3];
982
983 register cmph_uint32 disp,position;
984 register cmph_uint32 probe0_num,probe1_num;
985 register cmph_uint32 f,g,h;
986
987 hash_vector_packed(hl_ptr, hl_type, key, keylen, hl);
988
989 g = hl[0] % nbuckets;
990 f = hl[1] % n;
991 h = hl[2] % (n-1) + 1;
992
993 disp = compressed_seq_query_packed(ptr, g);
994 probe0_num = disp % n;
995 probe1_num = disp/n;
996 position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % n);
997 return position;
998 }
999
1000
1001