1 #include "graph.h"
2 #include "fch.h"
3 #include "fch_structs.h"
4 #include "bmz8.h"
5 #include "bmz8_structs.h"
6 #include "brz.h"
7 #include "cmph_structs.h"
8 #include "brz_structs.h"
9 #include "buffer_manager.h"
10 #include "cmph.h"
11 #include "hash.h"
12 #include "bitbool.h"
13 #include <math.h>
14 #include <stdlib.h>
15 #include <stdio.h>
16 #include <assert.h>
17 #include <string.h>
18 #include <errno.h>
19 #define MAX_BUCKET_SIZE 255
20 //#define DEBUG
21 #include "debug.h"
22
23 #if defined (__ia64) || defined (__x86_64__) || defined (_WIN64)
24 # define __brz_use_64bit__
25 #endif
26
27 static int brz_gen_mphf(cmph_config_t *mph);
28 static cmph_uint32 brz_min_index(cmph_uint32 * vector, cmph_uint32 n);
29 static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint32 nkeys);
30 static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen);
31 static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen);
32 brz_config_data_t *brz_config_new(void)
33 {
34 brz_config_data_t *brz = NULL;
35 brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t));
36 brz->algo = CMPH_FCH;
37 brz->b = 128;
38 brz->hashfuncs[0] = CMPH_HASH_JENKINS;
39 brz->hashfuncs[1] = CMPH_HASH_JENKINS;
40 brz->hashfuncs[2] = CMPH_HASH_JENKINS;
41 brz->size = NULL;
42 brz->offset = NULL;
43 brz->g = NULL;
44 brz->h1 = NULL;
45 brz->h2 = NULL;
46 brz->h0 = NULL;
47 brz->memory_availability = 1024*1024;
48 brz->tmp_dir = (cmph_uint8 *)calloc((size_t)10, sizeof(cmph_uint8));
49 brz->mphf_fd = NULL;
50 strcpy((char *)(brz->tmp_dir), "/var/tmp/");
51 assert(brz);
52 return brz;
53 }
54
55 void brz_config_destroy(cmph_config_t *mph)
56 {
57 brz_config_data_t *data = (brz_config_data_t *)mph->data;
58 free(data->tmp_dir);
59 DEBUGP("Destroying algorithm dependent data\n");
60 free(data);
61 }
62
63 void brz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
64 {
65 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
66 CMPH_HASH *hashptr = hashfuncs;
67 cmph_uint32 i = 0;
68 while(*hashptr != CMPH_HASH_COUNT)
69 {
70 if (i >= 3) break; //brz only uses three hash functions
71 brz->hashfuncs[i] = *hashptr;
72 ++i, ++hashptr;
73 }
74 }
75
76 void brz_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability)
77 {
78 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
79 if(memory_availability > 0) brz->memory_availability = memory_availability*1024*1024;
80 }
81
82 void brz_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir)
83 {
84 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
85 if(tmp_dir)
86 {
87 size_t len = strlen((char *)tmp_dir);
88 free(brz->tmp_dir);
89 if(tmp_dir[len-1] != '/')
90 {
91 brz->tmp_dir = (cmph_uint8 *)calloc((size_t)len+2, sizeof(cmph_uint8));
92 sprintf((char *)(brz->tmp_dir), "%s/", (char *)tmp_dir);
93 }
94 else
95 {
96 brz->tmp_dir = (cmph_uint8 *)calloc((size_t)len+1, sizeof(cmph_uint8));
97 sprintf((char *)(brz->tmp_dir), "%s", (char *)tmp_dir);
98 }
99
100 }
101 }
102
103 void brz_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd)
104 {
105 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
106 brz->mphf_fd = mphf_fd;
107 assert(brz->mphf_fd);
108 }
109
110 void brz_config_set_b(cmph_config_t *mph, cmph_uint32 b)
111 {
112 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
113 if(b <= 64 || b >= 175)
114 {
115 b = 128;
116 }
117 brz->b = (cmph_uint8)b;
118 }
119
120 void brz_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo)
121 {
122 if (algo == CMPH_BMZ8 || algo == CMPH_FCH) // supported algorithms
123 {
124 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
125 brz->algo = algo;
126 }
127 }
128
129 cmph_t *brz_new(cmph_config_t *mph, double c)
130 {
131 cmph_t *mphf = NULL;
132 brz_data_t *brzf = NULL;
133 cmph_uint32 i;
134 cmph_uint32 iterations = 20;
135 brz_config_data_t *brz;
136
137 DEBUGP("c: %f\n", c);
138 brz = (brz_config_data_t *)mph->data;
139 switch(brz->algo) // validating restrictions over parameter c.
140 {
141 case CMPH_BMZ8:
142 if (c == 0 || c >= 2.0) c = 1;
143 break;
144 case CMPH_FCH:
145 if (c <= 2.0) c = 2.6;
146 break;
147 default:
148 assert(0);
149 }
150 brz->c = c;
151 brz->m = mph->key_source->nkeys;
152 DEBUGP("m: %u\n", brz->m);
153 brz->k = (cmph_uint32)ceil(brz->m/((double)brz->b));
154 DEBUGP("k: %u\n", brz->k);
155 brz->size = (cmph_uint8 *) calloc((size_t)brz->k, sizeof(cmph_uint8));
156
157 // Clustering the keys by graph id.
158 if (mph->verbosity)
159 {
160 fprintf(stderr, "Partioning the set of keys.\n");
161 }
162
163 while(1)
164 {
165 int ok;
166 DEBUGP("hash function 3\n");
167 brz->h0 = hash_state_new(brz->hashfuncs[2], brz->k);
168 DEBUGP("Generating graphs\n");
169 ok = brz_gen_mphf(mph);
170 if (!ok)
171 {
172 --iterations;
173 hash_state_destroy(brz->h0);
174 brz->h0 = NULL;
175 DEBUGP("%u iterations remaining to create the graphs in a external file\n", iterations);
176 if (mph->verbosity)
177 {
178 fprintf(stderr, "Failure: A graph with more than 255 keys was created - %u iterations remaining\n", iterations);
179 }
180 if (iterations == 0) break;
181 }
182 else break;
183 }
184 if (iterations == 0)
185 {
186 DEBUGP("Graphs with more than 255 keys were created in all 20 iterations\n");
187 free(brz->size);
188 return NULL;
189 }
190 DEBUGP("Graphs generated\n");
191
192 brz->offset = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32));
193 for (i = 1; i < brz->k; ++i)
194 {
195 brz->offset[i] = brz->size[i-1] + brz->offset[i-1];
196 }
197 // Generating a mphf
198 mphf = (cmph_t *)malloc(sizeof(cmph_t));
199 mphf->algo = mph->algo;
200 brzf = (brz_data_t *)malloc(sizeof(brz_data_t));
201 brzf->g = brz->g;
202 brz->g = NULL; //transfer memory ownership
203 brzf->h1 = brz->h1;
204 brz->h1 = NULL; //transfer memory ownership
205 brzf->h2 = brz->h2;
206 brz->h2 = NULL; //transfer memory ownership
207 brzf->h0 = brz->h0;
208 brz->h0 = NULL; //transfer memory ownership
209 brzf->size = brz->size;
210 brz->size = NULL; //transfer memory ownership
211 brzf->offset = brz->offset;
212 brz->offset = NULL; //transfer memory ownership
213 brzf->k = brz->k;
214 brzf->c = brz->c;
215 brzf->m = brz->m;
216 brzf->algo = brz->algo;
217 mphf->data = brzf;
218 mphf->size = brz->m;
219 DEBUGP("Successfully generated minimal perfect hash\n");
220 if (mph->verbosity)
221 {
222 fprintf(stderr, "Successfully generated minimal perfect hash function\n");
223 }
224 return mphf;
225 }
226
227 static int brz_gen_mphf(cmph_config_t *mph)
228 {
229 cmph_uint32 i, e, error;
230 brz_config_data_t *brz = (brz_config_data_t *)mph->data;
231 cmph_uint32 memory_usage = 0;
232 cmph_uint32 nkeys_in_buffer = 0;
233 cmph_uint8 *buffer = (cmph_uint8 *)malloc((size_t)brz->memory_availability);
234 cmph_uint32 *buckets_size = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32));
235 cmph_uint32 *keys_index = NULL;
236 cmph_uint8 **buffer_merge = NULL;
237 cmph_uint32 *buffer_h0 = NULL;
238 cmph_uint32 nflushes = 0;
239 cmph_uint32 h0;
240 register size_t nbytes;
241 FILE * tmp_fd = NULL;
242 buffer_manager_t * buff_manager = NULL;
243 char *filename = NULL;
244 char *key = NULL;
245 cmph_uint32 keylen;
246 cmph_uint32 cur_bucket = 0;
247 cmph_uint8 nkeys_vd = 0;
248 cmph_uint8 ** keys_vd = NULL;
249
250 mph->key_source->rewind(mph->key_source->data);
251 DEBUGP("Generating graphs from %u keys\n", brz->m);
252 // Partitioning
253 for (e = 0; e < brz->m; ++e)
254 {
255 mph->key_source->read(mph->key_source->data, &key, &keylen);
256
257 /* Buffers management */
258 if (memory_usage + keylen + sizeof(keylen) > brz->memory_availability) // flush buffers
259 {
260 cmph_uint32 value, sum, keylen1;
261 if(mph->verbosity)
262 {
263 fprintf(stderr, "Flushing %u\n", nkeys_in_buffer);
264 }
265 value = buckets_size[0];
266 sum = 0;
267 keylen1 = 0;
268 buckets_size[0] = 0;
269 for(i = 1; i < brz->k; i++)
270 {
271 if(buckets_size[i] == 0) continue;
272 sum += value;
273 value = buckets_size[i];
274 buckets_size[i] = sum;
275
276 }
277 memory_usage = 0;
278 keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32));
279 for(i = 0; i < nkeys_in_buffer; i++)
280 {
281 memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1));
282 h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k;
283 keys_index[buckets_size[h0]] = memory_usage;
284 buckets_size[h0]++;
285 memory_usage += keylen1 + (cmph_uint32)sizeof(keylen1);
286 }
287 filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char));
288 sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes);
289 tmp_fd = fopen(filename, "wb");
290 free(filename);
291 filename = NULL;
292 for(i = 0; i < nkeys_in_buffer; i++)
293 {
294 memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1));
295 nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), tmp_fd);
296 }
297 nkeys_in_buffer = 0;
298 memory_usage = 0;
299 memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32));
300 nflushes++;
301 free(keys_index);
302 fclose(tmp_fd);
303 }
304 memcpy(buffer + memory_usage, &keylen, sizeof(keylen));
305 memcpy(buffer + memory_usage + sizeof(keylen), key, (size_t)keylen);
306 memory_usage += keylen + (cmph_uint32)sizeof(keylen);
307 h0 = hash(brz->h0, key, keylen) % brz->k;
308
309 if ((brz->size[h0] == MAX_BUCKET_SIZE) || (brz->algo == CMPH_BMZ8 && ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0])))
310 {
311 free(buffer);
312 free(buckets_size);
313 return 0;
314 }
315 brz->size[h0] = (cmph_uint8)(brz->size[h0] + 1U);
316 buckets_size[h0] ++;
317 nkeys_in_buffer++;
318 mph->key_source->dispose(mph->key_source->data, key, keylen);
319 }
320 if (memory_usage != 0) // flush buffers
321 {
322 cmph_uint32 value;
323 cmph_uint32 sum, keylen1;
324 if(mph->verbosity)
325 {
326 fprintf(stderr, "Flushing %u\n", nkeys_in_buffer);
327 }
328 value = buckets_size[0];
329 sum = 0;
330 keylen1 = 0;
331 buckets_size[0] = 0;
332 for(i = 1; i < brz->k; i++)
333 {
334 if(buckets_size[i] == 0) continue;
335 sum += value;
336 value = buckets_size[i];
337 buckets_size[i] = sum;
338 }
339 memory_usage = 0;
340 keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32));
341 for(i = 0; i < nkeys_in_buffer; i++)
342 {
343 memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1));
344 h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k;
345 keys_index[buckets_size[h0]] = memory_usage;
346 buckets_size[h0]++;
347 memory_usage += keylen1 + (cmph_uint32)sizeof(keylen1);
348 }
349 filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char));
350 sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes);
351 tmp_fd = fopen(filename, "wb");
352 free(filename);
353 filename = NULL;
354 for(i = 0; i < nkeys_in_buffer; i++)
355 {
356 memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1));
357 nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), tmp_fd);
358 }
359 nkeys_in_buffer = 0;
360 memory_usage = 0;
361 memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32));
362 nflushes++;
363 free(keys_index);
364 fclose(tmp_fd);
365 }
366
367 free(buffer);
368 free(buckets_size);
369 if(nflushes > 1024) return 0; // Too many files generated.
370 // mphf generation
371 if(mph->verbosity)
372 {
373 fprintf(stderr, "\nMPHF generation \n");
374 }
375 /* Starting to dump to disk the resultant MPHF: __cmph_dump function */
376 nbytes = fwrite(cmph_names[CMPH_BRZ], (size_t)(strlen(cmph_names[CMPH_BRZ]) + 1), (size_t)1, brz->mphf_fd);
377 nbytes = fwrite(&(brz->m), sizeof(brz->m), (size_t)1, brz->mphf_fd);
378 nbytes = fwrite(&(brz->c), sizeof(double), (size_t)1, brz->mphf_fd);
379 nbytes = fwrite(&(brz->algo), sizeof(brz->algo), (size_t)1, brz->mphf_fd);
380 nbytes = fwrite(&(brz->k), sizeof(cmph_uint32), (size_t)1, brz->mphf_fd); // number of MPHFs
381 nbytes = fwrite(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, brz->mphf_fd);
382 if (nbytes == 0 && ferror(brz->mphf_fd)) {
383 fprintf(stderr, "ERROR: %s\n", strerror(errno));
384 return 0;
385 }
386
387 //tmp_fds = (FILE **)calloc(nflushes, sizeof(FILE *));
388 buff_manager = buffer_manager_new(brz->memory_availability, nflushes);
389 buffer_merge = (cmph_uint8 **)calloc((size_t)nflushes, sizeof(cmph_uint8 *));
390 buffer_h0 = (cmph_uint32 *)calloc((size_t)nflushes, sizeof(cmph_uint32));
391
392 memory_usage = 0;
393 for(i = 0; i < nflushes; i++)
394 {
395 filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char));
396 sprintf(filename, "%s%u.cmph",brz->tmp_dir, i);
397 buffer_manager_open(buff_manager, i, filename);
398 free(filename);
399 filename = NULL;
400 key = (char *)buffer_manager_read_key(buff_manager, i, &keylen);
401 h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k;
402 buffer_h0[i] = h0;
403 buffer_merge[i] = (cmph_uint8 *)key;
404 key = NULL; //transfer memory ownership
405 }
406 e = 0;
407 keys_vd = (cmph_uint8 **)calloc((size_t)MAX_BUCKET_SIZE, sizeof(cmph_uint8 *));
408 nkeys_vd = 0;
409 error = 0;
410 while(e < brz->m)
411 {
412 i = brz_min_index(buffer_h0, nflushes);
413 cur_bucket = buffer_h0[i];
414 key = (char *)buffer_manager_read_key(buff_manager, i, &keylen);
415 if(key)
416 {
417 while(key)
418 {
419 //keylen = strlen(key);
420 h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k;
421 if (h0 != buffer_h0[i]) break;
422 keys_vd[nkeys_vd++] = (cmph_uint8 *)key;
423 key = NULL; //transfer memory ownership
424 e++;
425 key = (char *)buffer_manager_read_key(buff_manager, i, &keylen);
426 }
427 if (key)
428 {
429 assert(nkeys_vd < brz->size[cur_bucket]);
430 keys_vd[nkeys_vd++] = buffer_merge[i];
431 buffer_merge[i] = NULL; //transfer memory ownership
432 e++;
433 buffer_h0[i] = h0;
434 buffer_merge[i] = (cmph_uint8 *)key;
435 }
436 }
437 if(!key)
438 {
439 assert(nkeys_vd < brz->size[cur_bucket]);
440 keys_vd[nkeys_vd++] = buffer_merge[i];
441 buffer_merge[i] = NULL; //transfer memory ownership
442 e++;
443 buffer_h0[i] = UINT_MAX;
444 }
445
446 if(nkeys_vd == brz->size[cur_bucket]) // Generating mphf for each bucket.
447 {
448 cmph_io_adapter_t *source = NULL;
449 cmph_config_t *config = NULL;
450 cmph_t *mphf_tmp = NULL;
451 char *bufmphf = NULL;
452 cmph_uint32 buflenmphf = 0;
453 // Source of keys
454 source = cmph_io_byte_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd);
455 config = cmph_config_new(source);
456 cmph_config_set_algo(config, brz->algo);
457 //cmph_config_set_algo(config, CMPH_BMZ8);
458 cmph_config_set_graphsize(config, brz->c);
459 mphf_tmp = cmph_new(config);
460 if (mphf_tmp == NULL)
461 {
462 if(mph->verbosity) fprintf(stderr, "ERROR: Can't generate MPHF for bucket %u out of %u\n", cur_bucket + 1, brz->k);
463 error = 1;
464 cmph_config_destroy(config);
465 brz_destroy_keys_vd(keys_vd, nkeys_vd);
466 cmph_io_byte_vector_adapter_destroy(source);
467 break;
468 }
469 if(mph->verbosity)
470 {
471 if (cur_bucket % 1000 == 0)
472 {
473 fprintf(stderr, "MPHF for bucket %u out of %u was generated.\n", cur_bucket + 1, brz->k);
474 }
475 }
476 switch(brz->algo)
477 {
478 case CMPH_FCH:
479 {
480 fch_data_t * fchf = NULL;
481 fchf = (fch_data_t *)mphf_tmp->data;
482 bufmphf = brz_copy_partial_fch_mphf(brz, fchf, cur_bucket, &buflenmphf);
483 }
484 break;
485 case CMPH_BMZ8:
486 {
487 bmz8_data_t * bmzf = NULL;
488 bmzf = (bmz8_data_t *)mphf_tmp->data;
489 bufmphf = brz_copy_partial_bmz8_mphf(brz, bmzf, cur_bucket, &buflenmphf);
490 }
491 break;
492 default: assert(0);
493 }
494 nbytes = fwrite(bufmphf, (size_t)buflenmphf, (size_t)1, brz->mphf_fd);
495 free(bufmphf);
496 bufmphf = NULL;
497 cmph_config_destroy(config);
498 brz_destroy_keys_vd(keys_vd, nkeys_vd);
499 cmph_destroy(mphf_tmp);
500 cmph_io_byte_vector_adapter_destroy(source);
501 nkeys_vd = 0;
502 }
503 }
504 buffer_manager_destroy(buff_manager);
505 free(keys_vd);
506 free(buffer_merge);
507 free(buffer_h0);
508 if (error) return 0;
509 return 1;
510 }
511
512 static cmph_uint32 brz_min_index(cmph_uint32 * vector, cmph_uint32 n)
513 {
514 cmph_uint32 i, min_index = 0;
515 for(i = 1; i < n; i++)
516 {
517 if(vector[i] < vector[min_index]) min_index = i;
518 }
519 return min_index;
520 }
521
522 static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint32 nkeys)
523 {
524 cmph_uint8 i;
525 for(i = 0; i < nkeys; i++) { free(keys_vd[i]); keys_vd[i] = NULL;}
526 }
527
528 static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen)
529 {
530 cmph_uint32 i = 0;
531 cmph_uint32 buflenh1 = 0;
532 cmph_uint32 buflenh2 = 0;
533 char * bufh1 = NULL;
534 char * bufh2 = NULL;
535 char * buf = NULL;
536 cmph_uint32 n = fchf->b;//brz->size[index];
537 hash_state_dump(fchf->h1, &bufh1, &buflenh1);
538 hash_state_dump(fchf->h2, &bufh2, &buflenh2);
539 *buflen = buflenh1 + buflenh2 + n + 2U * (cmph_uint32)sizeof(cmph_uint32);
540 buf = (char *)malloc((size_t)(*buflen));
541 memcpy(buf, &buflenh1, sizeof(cmph_uint32));
542 memcpy(buf+sizeof(cmph_uint32), bufh1, (size_t)buflenh1);
543 memcpy(buf+sizeof(cmph_uint32)+buflenh1, &buflenh2, sizeof(cmph_uint32));
544 memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, (size_t)buflenh2);
545 for (i = 0; i < n; i++) memcpy(buf+2*sizeof(cmph_uint32)+buflenh1+buflenh2+i,(fchf->g + i), (size_t)1);
546 free(bufh1);
547 free(bufh2);
548 return buf;
549 }
550 static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen)
551 {
552 cmph_uint32 buflenh1 = 0;
553 cmph_uint32 buflenh2 = 0;
554 char * bufh1 = NULL;
555 char * bufh2 = NULL;
556 char * buf = NULL;
557 cmph_uint32 n = (cmph_uint32)ceil(brz->c * brz->size[index]);
558 hash_state_dump(bmzf->hashes[0], &bufh1, &buflenh1);
559 hash_state_dump(bmzf->hashes[1], &bufh2, &buflenh2);
560 *buflen = buflenh1 + buflenh2 + n + 2U * (cmph_uint32)sizeof(cmph_uint32);
561 buf = (char *)malloc((size_t)(*buflen));
562 memcpy(buf, &buflenh1, sizeof(cmph_uint32));
563 memcpy(buf+sizeof(cmph_uint32), bufh1, (size_t)buflenh1);
564 memcpy(buf+sizeof(cmph_uint32)+buflenh1, &buflenh2, sizeof(cmph_uint32));
565 memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, (size_t)buflenh2);
566 memcpy(buf+2*sizeof(cmph_uint32)+buflenh1+buflenh2,bmzf->g, (size_t)n);
567 free(bufh1);
568 free(bufh2);
569 return buf;
570 }
571
572
573 int brz_dump(cmph_t *mphf, FILE *fd)
574 {
575 brz_data_t *data = (brz_data_t *)mphf->data;
576 char *buf = NULL;
577 cmph_uint32 buflen;
578 register size_t nbytes;
579 DEBUGP("Dumping brzf\n");
580 // The initial part of the MPHF have already been dumped to disk during construction
581 // Dumping h0
582 hash_state_dump(data->h0, &buf, &buflen);
583 DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
584 nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
585 nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
586 free(buf);
587 // Dumping m and the vector offset.
588 nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd);
589 nbytes = fwrite(data->offset, sizeof(cmph_uint32)*(data->k), (size_t)1, fd);
590 if (nbytes == 0 && ferror(fd)) {
591 fprintf(stderr, "ERROR: %s\n", strerror(errno));
592 return 0;
593 }
594 return 1;
595 }
596
597 void brz_load(FILE *f, cmph_t *mphf)
598 {
599 char *buf = NULL;
600 cmph_uint32 buflen;
601 register size_t nbytes;
602 cmph_uint32 i, n;
603 brz_data_t *brz = (brz_data_t *)malloc(sizeof(brz_data_t));
604
605 DEBUGP("Loading brz mphf\n");
606 mphf->data = brz;
607 nbytes = fread(&(brz->c), sizeof(double), (size_t)1, f);
608 nbytes = fread(&(brz->algo), sizeof(brz->algo), (size_t)1, f); // Reading algo.
609 nbytes = fread(&(brz->k), sizeof(cmph_uint32), (size_t)1, f);
610 brz->size = (cmph_uint8 *) malloc(sizeof(cmph_uint8)*brz->k);
611 nbytes = fread(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, f);
612 brz->h1 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k);
613 brz->h2 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k);
614 brz->g = (cmph_uint8 **) calloc((size_t)brz->k, sizeof(cmph_uint8 *));
615 DEBUGP("Reading c = %f k = %u algo = %u \n", brz->c, brz->k, brz->algo);
616 //loading h_i1, h_i2 and g_i.
617 for(i = 0; i < brz->k; i++)
618 {
619 // h1
620 nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
621 DEBUGP("Hash state 1 has %u bytes\n", buflen);
622 buf = (char *)malloc((size_t)buflen);
623 nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
624 brz->h1[i] = hash_state_load(buf, buflen);
625 free(buf);
626 //h2
627 nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
628 DEBUGP("Hash state 2 has %u bytes\n", buflen);
629 buf = (char *)malloc((size_t)buflen);
630 nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
631 brz->h2[i] = hash_state_load(buf, buflen);
632 free(buf);
633 switch(brz->algo)
634 {
635 case CMPH_FCH:
636 n = fch_calc_b(brz->c, brz->size[i]);
637 break;
638 case CMPH_BMZ8:
639 n = (cmph_uint32)ceil(brz->c * brz->size[i]);
640 break;
641 default: assert(0);
642 }
643 DEBUGP("g_i has %u bytes\n", n);
644 brz->g[i] = (cmph_uint8 *)calloc((size_t)n, sizeof(cmph_uint8));
645 nbytes = fread(brz->g[i], sizeof(cmph_uint8)*n, (size_t)1, f);
646 }
647 //loading h0
648 nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
649 DEBUGP("Hash state has %u bytes\n", buflen);
650 buf = (char *)malloc((size_t)buflen);
651 nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
652 brz->h0 = hash_state_load(buf, buflen);
653 free(buf);
654
655 //loading c, m, and the vector offset.
656 nbytes = fread(&(brz->m), sizeof(cmph_uint32), (size_t)1, f);
657 brz->offset = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*brz->k);
658 nbytes = fread(brz->offset, sizeof(cmph_uint32)*(brz->k), (size_t)1, f);
659 if (nbytes == 0 && ferror(f)) {
660 fprintf(stderr, "ERROR: %s\n", strerror(errno));
661 }
662
663 return;
664 }
665
666 static cmph_uint32 brz_bmz8_search(brz_data_t *brz, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint)
667 {
668 register cmph_uint32 h0;
669 register cmph_uint32 m, n, h1, h2;
670 register cmph_uint8 mphf_bucket;
671
672 hash_vector(brz->h0, key, keylen, fingerprint);
673 h0 = fingerprint[2] % brz->k;
674
675 m = brz->size[h0];
676 n = (cmph_uint32)ceil(brz->c * m);
677 h1 = hash(brz->h1[h0], key, keylen) % n;
678 h2 = hash(brz->h2[h0], key, keylen) % n;
679
680 if (h1 == h2 && ++h2 >= n) h2 = 0;
681 mphf_bucket = (cmph_uint8)(brz->g[h0][h1] + brz->g[h0][h2]);
682 DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0);
683 DEBUGP("key: %s g[h1]: %u g[h2]: %u offset[h0]: %u edges: %u\n", key, brz->g[h0][h1], brz->g[h0][h2], brz->offset[h0], brz->m);
684 DEBUGP("Address: %u\n", mphf_bucket + brz->offset[h0]);
685 return (mphf_bucket + brz->offset[h0]);
686 }
687
688 static cmph_uint32 brz_fch_search(brz_data_t *brz, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint)
689 {
690 register cmph_uint32 h0;
691 register cmph_uint32 m, b, h1, h2;
692 register double p1, p2;
693 register cmph_uint8 mphf_bucket;
694
695 hash_vector(brz->h0, key, keylen, fingerprint);
696 h0 = fingerprint[2] % brz->k;
697
698 m = brz->size[h0];
699 b = fch_calc_b(brz->c, m);
700 p1 = fch_calc_p1(m);
701 p2 = fch_calc_p2(b);
702 h1 = hash(brz->h1[h0], key, keylen) % m;
703 h2 = hash(brz->h2[h0], key, keylen) % m;
704 mphf_bucket = 0;
705 h1 = mixh10h11h12(b, p1, p2, h1);
706 mphf_bucket = (cmph_uint8)((h2 + brz->g[h0][h1]) % m);
707 return (mphf_bucket + brz->offset[h0]);
708 }
709
710 cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
711 {
712 brz_data_t *brz = mphf->data;
713 cmph_uint32 fingerprint[3];
714 switch(brz->algo)
715 {
716 case CMPH_FCH:
717 return brz_fch_search(brz, key, keylen, fingerprint);
718 case CMPH_BMZ8:
719 return brz_bmz8_search(brz, key, keylen, fingerprint);
720 default: assert(0);
721 }
722 return 0;
723 }
724 void brz_destroy(cmph_t *mphf)
725 {
726 cmph_uint32 i;
727 brz_data_t *data = (brz_data_t *)mphf->data;
728 if(data->g)
729 {
730 for(i = 0; i < data->k; i++)
731 {
732 free(data->g[i]);
733 hash_state_destroy(data->h1[i]);
734 hash_state_destroy(data->h2[i]);
735 }
736 free(data->g);
737 free(data->h1);
738 free(data->h2);
739 }
740 hash_state_destroy(data->h0);
741 free(data->size);
742 free(data->offset);
743 free(data);
744 free(mphf);
745 }
746
747 /** \fn void brz_pack(cmph_t *mphf, void *packed_mphf);
748 * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
749 * \param mphf pointer to the resulting mphf
750 * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
751 */
752 void brz_pack(cmph_t *mphf, void *packed_mphf)
753 {
754 brz_data_t *data = (brz_data_t *)mphf->data;
755 cmph_uint8 * ptr = packed_mphf;
756 cmph_uint32 i,n;
757 CMPH_HASH h0_type, h1_type, h2_type;
758 #ifdef __brz_use_64bit__
759 cmph_uint64 * g_is_ptr;
760 #else
761 cmph_uint32 * g_is_ptr;
762 #endif
763 cmph_uint8 * g_i;
764
765 // packing internal algo type
766 memcpy(ptr, &(data->algo), sizeof(data->algo));
767 ptr += sizeof(data->algo);
768
769 // packing h0 type
770 h0_type = hash_get_type(data->h0);
771 memcpy(ptr, &h0_type, sizeof(h0_type));
772 ptr += sizeof(h0_type);
773
774 // packing h0
775 hash_state_pack(data->h0, ptr);
776 ptr += hash_state_packed_size(h0_type);
777
778 // packing k
779 memcpy(ptr, &(data->k), sizeof(data->k));
780 ptr += sizeof(data->k);
781
782 // packing c
783 *((cmph_uint64 *)ptr) = (cmph_uint64)data->c;
784 ptr += sizeof(data->c);
785
786 // packing h1 type
787 h1_type = hash_get_type(data->h1[0]);
788 memcpy(ptr, &h1_type, sizeof(h1_type));
789 ptr += sizeof(h1_type);
790
791 // packing h2 type
792 h2_type = hash_get_type(data->h2[0]);
793 memcpy(ptr, &h2_type, sizeof(h2_type));
794 ptr += sizeof(h2_type);
795
796 // packing size
797 memcpy(ptr, data->size, sizeof(cmph_uint8)*data->k);
798 ptr += data->k;
799
800 // packing offset
801 memcpy(ptr, data->offset, sizeof(cmph_uint32)*data->k);
802 ptr += sizeof(cmph_uint32)*data->k;
803
804 #ifdef __brz_use_64bit__
805 g_is_ptr = (cmph_uint64 *)ptr;
806 #else
807 g_is_ptr = (cmph_uint32 *)ptr;
808 #endif
809
810 g_i = (cmph_uint8 *) (g_is_ptr + data->k);
811
812 for(i = 0; i < data->k; i++)
813 {
814 #ifdef __brz_use_64bit__
815 *g_is_ptr++ = (cmph_uint64)g_i;
816 #else
817 *g_is_ptr++ = (cmph_uint32)g_i;
818 #endif
819 // packing h1[i]
820 hash_state_pack(data->h1[i], g_i);
821 g_i += hash_state_packed_size(h1_type);
822
823 // packing h2[i]
824 hash_state_pack(data->h2[i], g_i);
825 g_i += hash_state_packed_size(h2_type);
826
827 // packing g_i
828 switch(data->algo)
829 {
830 case CMPH_FCH:
831 n = fch_calc_b(data->c, data->size[i]);
832 break;
833 case CMPH_BMZ8:
834 n = (cmph_uint32)ceil(data->c * data->size[i]);
835 break;
836 default: assert(0);
837 }
838 memcpy(g_i, data->g[i], sizeof(cmph_uint8)*n);
839 g_i += n;
840
841 }
842
843 }
844
845 /** \fn cmph_uint32 brz_packed_size(cmph_t *mphf);
846 * \brief Return the amount of space needed to pack mphf.
847 * \param mphf pointer to a mphf
848 * \return the size of the packed function or zero for failures
849 */
850 cmph_uint32 brz_packed_size(cmph_t *mphf)
851 {
852 cmph_uint32 i;
853 cmph_uint32 size = 0;
854 brz_data_t *data = (brz_data_t *)mphf->data;
855 CMPH_HASH h0_type = hash_get_type(data->h0);
856 CMPH_HASH h1_type = hash_get_type(data->h1[0]);
857 CMPH_HASH h2_type = hash_get_type(data->h2[0]);
858 cmph_uint32 n;
859 size = (cmph_uint32)(2*sizeof(CMPH_ALGO) + 3*sizeof(CMPH_HASH) + hash_state_packed_size(h0_type) + sizeof(cmph_uint32) +
860 sizeof(double) + sizeof(cmph_uint8)*data->k + sizeof(cmph_uint32)*data->k);
861 // pointers to g_is
862 #ifdef __brz_use_64bit__
863 size += (cmph_uint32) sizeof(cmph_uint64)*data->k;
864 #else
865 size += (cmph_uint32) sizeof(cmph_uint32)*data->k;
866 #endif
867
868 size += hash_state_packed_size(h1_type) * data->k;
869 size += hash_state_packed_size(h2_type) * data->k;
870
871 n = 0;
872 for(i = 0; i < data->k; i++)
873 {
874 switch(data->algo)
875 {
876 case CMPH_FCH:
877 n = fch_calc_b(data->c, data->size[i]);
878 break;
879 case CMPH_BMZ8:
880 n = (cmph_uint32)ceil(data->c * data->size[i]);
881 break;
882 default: assert(0);
883 }
884 size += n;
885 }
886 return size;
887 }
888
889
890
891 static cmph_uint32 brz_bmz8_search_packed(cmph_uint32 *packed_mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint)
892 {
893 register CMPH_HASH h0_type = *packed_mphf++;
894 register cmph_uint32 *h0_ptr = packed_mphf;
895 register cmph_uint32 k, h0, m, n, h1, h2;
896 register cmph_uint32 *offset;
897 register double c;
898 register CMPH_HASH h1_type, h2_type;
899 register cmph_uint8 * size;
900 #ifdef __brz_use_64bit__
901 register cmph_uint64 * g_is_ptr;
902 #else
903 register cmph_uint32 * g_is_ptr;
904 #endif
905 register cmph_uint8 *h1_ptr, *h2_ptr, *g;
906 register cmph_uint8 mphf_bucket;
907
908 packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type));
909
910 k = *packed_mphf++;
911
912 c = (double)(*((cmph_uint64*)packed_mphf));
913 packed_mphf += 2;
914
915 h1_type = *packed_mphf++;
916
917 h2_type = *packed_mphf++;
918
919 size = (cmph_uint8 *) packed_mphf;
920 packed_mphf = (cmph_uint32 *)(size + k);
921
922 offset = packed_mphf;
923 packed_mphf += k;
924
925
926 hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint);
927 h0 = fingerprint[2] % k;
928
929 m = size[h0];
930 n = (cmph_uint32)ceil(c * m);
931
932 #ifdef __brz_use_64bit__
933 g_is_ptr = (cmph_uint64 *)packed_mphf;
934 #else
935 g_is_ptr = packed_mphf;
936 #endif
937
938 h1_ptr = (cmph_uint8 *) g_is_ptr[h0];
939
940 h2_ptr = h1_ptr + hash_state_packed_size(h1_type);
941
942 g = h2_ptr + hash_state_packed_size(h2_type);
943
944 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n;
945 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n;
946
947 if (h1 == h2 && ++h2 >= n) h2 = 0;
948 mphf_bucket = (cmph_uint8)(g[h1] + g[h2]);
949 DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0);
950 DEBUGP("Address: %u\n", mphf_bucket + offset[h0]);
951 return (mphf_bucket + offset[h0]);
952 }
953
954 static cmph_uint32 brz_fch_search_packed(cmph_uint32 *packed_mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint)
955 {
956 register CMPH_HASH h0_type = *packed_mphf++;
957
958 register cmph_uint32 *h0_ptr = packed_mphf;
959 register cmph_uint32 k, h0, m, b, h1, h2;
960 register double c, p1, p2;
961 register CMPH_HASH h1_type, h2_type;
962 register cmph_uint8 *size, *h1_ptr, *h2_ptr, *g;
963 register cmph_uint32 *offset;
964 #ifdef __brz_use_64bit__
965 register cmph_uint64 * g_is_ptr;
966 #else
967 register cmph_uint32 * g_is_ptr;
968 #endif
969 register cmph_uint8 mphf_bucket;
970
971 packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type));
972
973 k = *packed_mphf++;
974
975 c = (double)(*((cmph_uint64*)packed_mphf));
976 packed_mphf += 2;
977
978 h1_type = *packed_mphf++;
979
980 h2_type = *packed_mphf++;
981
982 size = (cmph_uint8 *) packed_mphf;
983 packed_mphf = (cmph_uint32 *)(size + k);
984
985 offset = packed_mphf;
986 packed_mphf += k;
987
988 hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint);
989 h0 = fingerprint[2] % k;
990
991 m = size[h0];
992 b = fch_calc_b(c, m);
993 p1 = fch_calc_p1(m);
994 p2 = fch_calc_p2(b);
995
996 #ifdef __brz_use_64bit__
997 g_is_ptr = (cmph_uint64 *)packed_mphf;
998 #else
999 g_is_ptr = packed_mphf;
1000 #endif
1001
1002 h1_ptr = (cmph_uint8 *) g_is_ptr[h0];
1003
1004 h2_ptr = h1_ptr + hash_state_packed_size(h1_type);
1005
1006 g = h2_ptr + hash_state_packed_size(h2_type);
1007
1008 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m;
1009 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % m;
1010
1011 mphf_bucket = 0;
1012 h1 = mixh10h11h12(b, p1, p2, h1);
1013 mphf_bucket = (cmph_uint8)((h2 + g[h1]) % m);
1014 return (mphf_bucket + offset[h0]);
1015 }
1016
1017 /** cmph_uint32 brz_search(void *packed_mphf, const char *key, cmph_uint32 keylen);
1018 * \brief Use the packed mphf to do a search.
1019 * \param packed_mphf pointer to the packed mphf
1020 * \param key key to be hashed
1021 * \param keylen key legth in bytes
1022 * \return The mphf value
1023 */
1024 cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
1025 {
1026 register cmph_uint32 *ptr = (cmph_uint32 *)packed_mphf;
1027 register CMPH_ALGO algo = *ptr++;
1028 cmph_uint32 fingerprint[3];
1029 switch(algo)
1030 {
1031 case CMPH_FCH:
1032 return brz_fch_search_packed(ptr, key, keylen, fingerprint);
1033 case CMPH_BMZ8:
1034 return brz_bmz8_search_packed(ptr, key, keylen, fingerprint);
1035 default:
1036 assert(0);
1037 return 0; /* To avoid warnings that value must be returned */
1038 }
1039 }
1040