1 #include "graph.h"
2 #include "chm.h"
3 #include "cmph_structs.h"
4 #include "chm_structs.h"
5 #include "hash.h"
6 #include "bitbool.h"
7
8 #include <math.h>
9 #include <stdlib.h>
10 #include <stdio.h>
11 #include <assert.h>
12 #include <string.h>
13 #include <errno.h>
14
15 //#define DEBUG
16 #include "debug.h"
17
18 static int chm_gen_edges(cmph_config_t *mph);
19 static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint32 v);
20
21 chm_config_data_t *chm_config_new(void)
22 {
23 chm_config_data_t *chm = NULL;
24 chm = (chm_config_data_t *)malloc(sizeof(chm_config_data_t));
25 assert(chm);
26 memset(chm, 0, sizeof(chm_config_data_t));
27 chm->hashfuncs[0] = CMPH_HASH_JENKINS;
28 chm->hashfuncs[1] = CMPH_HASH_JENKINS;
29 chm->g = NULL;
30 chm->graph = NULL;
31 chm->hashes = NULL;
32 return chm;
33 }
34 void chm_config_destroy(cmph_config_t *mph)
35 {
36 chm_config_data_t *data = (chm_config_data_t *)mph->data;
37 DEBUGP("Destroying algorithm dependent data\n");
38 free(data);
39 }
40
41 void chm_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
42 {
43 chm_config_data_t *chm = (chm_config_data_t *)mph->data;
44 CMPH_HASH *hashptr = hashfuncs;
45 cmph_uint32 i = 0;
46 while(*hashptr != CMPH_HASH_COUNT)
47 {
48 if (i >= 2) break; //chm only uses two hash functions
49 chm->hashfuncs[i] = *hashptr;
50 ++i, ++hashptr;
51 }
52 }
53
54 cmph_t *chm_new(cmph_config_t *mph, double c)
55 {
56 cmph_t *mphf = NULL;
57 chm_data_t *chmf = NULL;
58
59 cmph_uint32 i;
60 cmph_uint32 iterations = 20;
61 cmph_uint8 *visited = NULL;
62 chm_config_data_t *chm = (chm_config_data_t *)mph->data;
63 chm->m = mph->key_source->nkeys;
64 if (c == 0) c = 2.09;
65 chm->n = (cmph_uint32)ceil(c * mph->key_source->nkeys);
66 DEBUGP("m (edges): %u n (vertices): %u c: %f\n", chm->m, chm->n, c);
67 chm->graph = graph_new(chm->n, chm->m);
68 DEBUGP("Created graph\n");
69
70 chm->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3);
71 for(i = 0; i < 3; ++i) chm->hashes[i] = NULL;
72 //Mapping step
73 if (mph->verbosity)
74 {
75 fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", chm->m, chm->n);
76 }
77 while(1)
78 {
79 int ok;
80 chm->hashes[0] = hash_state_new(chm->hashfuncs[0], chm->n);
81 chm->hashes[1] = hash_state_new(chm->hashfuncs[1], chm->n);
82 ok = chm_gen_edges(mph);
83 if (!ok)
84 {
85 --iterations;
86 hash_state_destroy(chm->hashes[0]);
87 chm->hashes[0] = NULL;
88 hash_state_destroy(chm->hashes[1]);
89 chm->hashes[1] = NULL;
90 DEBUGP("%u iterations remaining\n", iterations);
91 if (mph->verbosity)
92 {
93 fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations);
94 }
95 if (iterations == 0) break;
96 }
97 else break;
98 }
99 if (iterations == 0)
100 {
101 graph_destroy(chm->graph);
102 return NULL;
103 }
104
105 //Assignment step
106 if (mph->verbosity)
107 {
108 fprintf(stderr, "Starting assignment step\n");
109 }
110 DEBUGP("Assignment step\n");
111 visited = (cmph_uint8 *)malloc((size_t)(chm->n/8 + 1));
112 memset(visited, 0, (size_t)(chm->n/8 + 1));
113 free(chm->g);
114 chm->g = (cmph_uint32 *)malloc(chm->n * sizeof(cmph_uint32));
115 assert(chm->g);
116 for (i = 0; i < chm->n; ++i)
117 {
118 if (!GETBIT(visited,i))
119 {
120 chm->g[i] = 0;
121 chm_traverse(chm, visited, i);
122 }
123 }
124 graph_destroy(chm->graph);
125 free(visited);
126 chm->graph = NULL;
127
128 mphf = (cmph_t *)malloc(sizeof(cmph_t));
129 mphf->algo = mph->algo;
130 chmf = (chm_data_t *)malloc(sizeof(chm_data_t));
131 chmf->g = chm->g;
132 chm->g = NULL; //transfer memory ownership
133 chmf->hashes = chm->hashes;
134 chm->hashes = NULL; //transfer memory ownership
135 chmf->n = chm->n;
136 chmf->m = chm->m;
137 mphf->data = chmf;
138 mphf->size = chm->m;
139 DEBUGP("Successfully generated minimal perfect hash\n");
140 if (mph->verbosity)
141 {
142 fprintf(stderr, "Successfully generated minimal perfect hash function\n");
143 }
144 return mphf;
145 }
146
147 static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint32 v)
148 {
149
150 graph_iterator_t it = graph_neighbors_it(chm->graph, v);
151 cmph_uint32 neighbor = 0;
152 SETBIT(visited,v);
153
154 DEBUGP("Visiting vertex %u\n", v);
155 while((neighbor = graph_next_neighbor(chm->graph, &it)) != GRAPH_NO_NEIGHBOR)
156 {
157 DEBUGP("Visiting neighbor %u\n", neighbor);
158 if(GETBIT(visited,neighbor)) continue;
159 DEBUGP("Visiting neighbor %u\n", neighbor);
160 DEBUGP("Visiting edge %u->%u with id %u\n", v, neighbor, graph_edge_id(chm->graph, v, neighbor));
161 chm->g[neighbor] = graph_edge_id(chm->graph, v, neighbor) - chm->g[v];
162 DEBUGP("g is %u (%u - %u mod %u)\n", chm->g[neighbor], graph_edge_id(chm->graph, v, neighbor), chm->g[v], chm->m);
163 chm_traverse(chm, visited, neighbor);
164 }
165 }
166
167 static int chm_gen_edges(cmph_config_t *mph)
168 {
169 cmph_uint32 e;
170 chm_config_data_t *chm = (chm_config_data_t *)mph->data;
171 int cycles = 0;
172
173 DEBUGP("Generating edges for %u vertices with hash functions %s and %s\n", chm->n, cmph_hash_names[chm->hashfuncs[0]], cmph_hash_names[chm->hashfuncs[1]]);
174 graph_clear_edges(chm->graph);
175 mph->key_source->rewind(mph->key_source->data);
176 for (e = 0; e < mph->key_source->nkeys; ++e)
177 {
178 cmph_uint32 h1, h2;
179 cmph_uint32 keylen;
180 char *key;
181 mph->key_source->read(mph->key_source->data, &key, &keylen);
182 h1 = hash(chm->hashes[0], key, keylen) % chm->n;
183 h2 = hash(chm->hashes[1], key, keylen) % chm->n;
184 if (h1 == h2) if (++h2 >= chm->n) h2 = 0;
185 if (h1 == h2)
186 {
187 if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e);
188 mph->key_source->dispose(mph->key_source->data, key, keylen);
189 return 0;
190 }
191 DEBUGP("Adding edge: %u -> %u for key %s\n", h1, h2, key);
192 mph->key_source->dispose(mph->key_source->data, key, keylen);
193 graph_add_edge(chm->graph, h1, h2);
194 }
195 cycles = graph_is_cyclic(chm->graph);
196 if (mph->verbosity && cycles) fprintf(stderr, "Cyclic graph generated\n");
197 DEBUGP("Looking for cycles: %u\n", cycles);
198
199 return ! cycles;
200 }
201
202 int chm_dump(cmph_t *mphf, FILE *fd)
203 {
204 char *buf = NULL;
205 cmph_uint32 buflen;
206 cmph_uint32 two = 2; //number of hash functions
207 chm_data_t *data = (chm_data_t *)mphf->data;
208 register size_t nbytes;
209
210 __cmph_dump(mphf, fd);
211
212 nbytes = fwrite(&two, sizeof(cmph_uint32), (size_t)1, fd);
213 hash_state_dump(data->hashes[0], &buf, &buflen);
214 DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
215 nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
216 nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
217 free(buf);
218
219 hash_state_dump(data->hashes[1], &buf, &buflen);
220 DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
221 nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
222 nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
223 free(buf);
224
225 nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd);
226 nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd);
227
228 nbytes = fwrite(data->g, sizeof(cmph_uint32)*data->n, (size_t)1, fd);
229 if (nbytes == 0 && ferror(fd)) {
230 fprintf(stderr, "ERROR: %s\n", strerror(errno));
231 return 0;
232 }
233 /* #ifdef DEBUG
234 fprintf(stderr, "G: ");
235 for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]);
236 fprintf(stderr, "\n");
237 #endif*/
238 return 1;
239 }
240
241 void chm_load(FILE *f, cmph_t *mphf)
242 {
243 cmph_uint32 nhashes;
244 char *buf = NULL;
245 cmph_uint32 buflen;
246 cmph_uint32 i;
247 chm_data_t *chm = (chm_data_t *)malloc(sizeof(chm_data_t));
248 register size_t nbytes;
249 DEBUGP("Loading chm mphf\n");
250 mphf->data = chm;
251 nbytes = fread(&nhashes, sizeof(cmph_uint32), (size_t)1, f);
252 chm->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(nhashes + 1));
253 chm->hashes[nhashes] = NULL;
254 DEBUGP("Reading %u hashes\n", nhashes);
255 for (i = 0; i < nhashes; ++i)
256 {
257 hash_state_t *state = NULL;
258 nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
259 DEBUGP("Hash state has %u bytes\n", buflen);
260 buf = (char *)malloc((size_t)buflen);
261 nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
262 state = hash_state_load(buf, buflen);
263 chm->hashes[i] = state;
264 free(buf);
265 }
266
267 DEBUGP("Reading m and n\n");
268 nbytes = fread(&(chm->n), sizeof(cmph_uint32), (size_t)1, f);
269 nbytes = fread(&(chm->m), sizeof(cmph_uint32), (size_t)1, f);
270
271 chm->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*chm->n);
272 nbytes = fread(chm->g, chm->n*sizeof(cmph_uint32), (size_t)1, f);
273 if (nbytes == 0 && ferror(f)) {
274 fprintf(stderr, "ERROR: %s\n", strerror(errno));
275 return;
276 }
277 #ifdef DEBUG
278 fprintf(stderr, "G: ");
279 for (i = 0; i < chm->n; ++i) fprintf(stderr, "%u ", chm->g[i]);
280 fprintf(stderr, "\n");
281 #endif
282 return;
283 }
284
285
286 cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
287 {
288 chm_data_t *chm = mphf->data;
289 cmph_uint32 h1 = hash(chm->hashes[0], key, keylen) % chm->n;
290 cmph_uint32 h2 = hash(chm->hashes[1], key, keylen) % chm->n;
291 DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2);
292 if (h1 == h2 && ++h2 >= chm->n) h2 = 0;
293 DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, chm->g[h1], chm->g[h2], chm->m);
294 return (chm->g[h1] + chm->g[h2]) % chm->m;
295 }
296 void chm_destroy(cmph_t *mphf)
297 {
298 chm_data_t *data = (chm_data_t *)mphf->data;
299 free(data->g);
300 hash_state_destroy(data->hashes[0]);
301 hash_state_destroy(data->hashes[1]);
302 free(data->hashes);
303 free(data);
304 free(mphf);
305 }
306
307 /** \fn void chm_pack(cmph_t *mphf, void *packed_mphf);
308 * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
309 * \param mphf pointer to the resulting mphf
310 * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
311 */
312 void chm_pack(cmph_t *mphf, void *packed_mphf)
313 {
314 chm_data_t *data = (chm_data_t *)mphf->data;
315 cmph_uint8 * ptr = packed_mphf;
316 CMPH_HASH h2_type;
317
318 // packing h1 type
319 CMPH_HASH h1_type = hash_get_type(data->hashes[0]);
320 *((cmph_uint32 *) ptr) = h1_type;
321 ptr += sizeof(cmph_uint32);
322
323 // packing h1
324 hash_state_pack(data->hashes[0], ptr);
325 ptr += hash_state_packed_size(h1_type);
326
327 // packing h2 type
328 h2_type = hash_get_type(data->hashes[1]);
329 *((cmph_uint32 *) ptr) = h2_type;
330 ptr += sizeof(cmph_uint32);
331
332 // packing h2
333 hash_state_pack(data->hashes[1], ptr);
334 ptr += hash_state_packed_size(h2_type);
335
336 // packing n
337 *((cmph_uint32 *) ptr) = data->n;
338 ptr += sizeof(data->n);
339
340 // packing m
341 *((cmph_uint32 *) ptr) = data->m;
342 ptr += sizeof(data->m);
343
344 // packing g
345 memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n);
346 }
347
348 /** \fn cmph_uint32 chm_packed_size(cmph_t *mphf);
349 * \brief Return the amount of space needed to pack mphf.
350 * \param mphf pointer to a mphf
351 * \return the size of the packed function or zero for failures
352 */
353 cmph_uint32 chm_packed_size(cmph_t *mphf)
354 {
355 chm_data_t *data = (chm_data_t *)mphf->data;
356 CMPH_HASH h1_type = hash_get_type(data->hashes[0]);
357 CMPH_HASH h2_type = hash_get_type(data->hashes[1]);
358
359 return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) +
360 4*sizeof(cmph_uint32) + sizeof(cmph_uint32)*data->n);
361 }
362
363 /** cmph_uint32 chm_search(void *packed_mphf, const char *key, cmph_uint32 keylen);
364 * \brief Use the packed mphf to do a search.
365 * \param packed_mphf pointer to the packed mphf
366 * \param key key to be hashed
367 * \param keylen key legth in bytes
368 * \return The mphf value
369 */
370 cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
371 {
372 register cmph_uint8 *h1_ptr = packed_mphf;
373 register CMPH_HASH h1_type = *((cmph_uint32 *)h1_ptr);
374 register cmph_uint8 *h2_ptr;
375 register CMPH_HASH h2_type;
376 register cmph_uint32 *g_ptr;
377 register cmph_uint32 n, m, h1, h2;
378
379 h1_ptr += 4;
380
381 h2_ptr = h1_ptr + hash_state_packed_size(h1_type);
382 h2_type = *((cmph_uint32 *)h2_ptr);
383 h2_ptr += 4;
384
385 g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type));
386
387 n = *g_ptr++;
388 m = *g_ptr++;
389
390 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n;
391 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n;
392 DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2);
393 if (h1 == h2 && ++h2 >= n) h2 = 0;
394 DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, g_ptr[h1], g_ptr[h2], m);
395 return (g_ptr[h1] + g_ptr[h2]) % m;
396 }