(root)/
glib-2.79.0/
girepository/
cmph/
chd_ph.c
       1  #include<stdio.h>
       2  #include<stdlib.h>
       3  #include<string.h>
       4  #include<math.h>
       5  #include<time.h>
       6  #include<assert.h>
       7  #include<limits.h>
       8  #include<errno.h>
       9  
      10  #include "cmph_structs.h"
      11  #include "chd_structs_ph.h"
      12  #include "chd_ph.h"
      13  #include"miller_rabin.h"
      14  #include"bitbool.h"
      15  
      16  
      17  //#define DEBUG
      18  #include "debug.h"
      19  
      20  // NO_ELEMENT is equivalent to null pointer
      21  #ifndef NO_ELEMENT
      22  #define NO_ELEMENT UINT_MAX
      23  #endif
      24  
      25  // struct used to represent items at mapping, ordering and searching phases
      26  struct _chd_ph_item_t
      27  {
      28  	cmph_uint32 f;
      29  	cmph_uint32 h;
      30  };
      31  typedef struct _chd_ph_item_t chd_ph_item_t;
      32  
      33  // struct to represent the items at mapping phase only. 
      34  struct _chd_ph_map_item_t
      35  {
      36  	cmph_uint32 f;
      37  	cmph_uint32 h;
      38  	cmph_uint32 bucket_num;
      39  };
      40  typedef struct _chd_ph_map_item_t chd_ph_map_item_t;
      41  
      42  // struct to represent a bucket
      43  struct _chd_ph_bucket_t
      44  {
      45  	cmph_uint32 items_list; // offset
      46  	union
      47  	{
      48  		cmph_uint32 size;
      49  		cmph_uint32 bucket_id;
      50  	};
      51  };
      52  
      53  typedef struct _chd_ph_bucket_t chd_ph_bucket_t;
      54  
      55  struct _chd_ph_sorted_list_t
      56  {
      57  	cmph_uint32 buckets_list;
      58  	cmph_uint32 size;
      59  };
      60  
      61  typedef struct _chd_ph_sorted_list_t chd_ph_sorted_list_t;
      62  
      63  
      64  static inline chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets);
      65  static inline void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets);
      66  static inline void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets);
      67  
      68  chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets)
      69  {
      70      chd_ph_bucket_t * buckets = (chd_ph_bucket_t *) calloc(nbuckets, sizeof(chd_ph_bucket_t));
      71      return buckets;
      72  }
      73  
      74  void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets)
      75  {
      76  	register cmph_uint32 i = 0;
      77  	assert(buckets);
      78  	for(i = 0; i < nbuckets; i++)
      79  		buckets[i].size = 0;
      80  }
      81  static cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets,chd_ph_map_item_t * map_items, chd_ph_item_t * items,
      82  				cmph_uint32 nbuckets,cmph_uint32 item_idx)
      83  {
      84  	register cmph_uint32 i = 0;
      85  	register chd_ph_item_t * tmp_item;
      86  	register chd_ph_map_item_t * tmp_map_item = map_items + item_idx;
      87  	register chd_ph_bucket_t * bucket = buckets + tmp_map_item->bucket_num;
      88  	tmp_item = items + bucket->items_list;
      89  	
      90  	for(i = 0; i < bucket->size; i++)
      91  	{
      92  		if(tmp_item->f == tmp_map_item->f && tmp_item->h == tmp_map_item->h)
      93  		{
      94  			DEBUGP("Item not added\n");
      95  			return 0;
      96  		};
      97  		tmp_item++;
      98  	};
      99  	tmp_item->f = tmp_map_item->f;
     100  	tmp_item->h = tmp_map_item->h;
     101  	bucket->size++;
     102  	return 1;
     103  };
     104  void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets)
     105  {
     106      free(buckets);
     107  }
     108  
     109  static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, 
     110  					cmph_uint32 *max_bucket_size);
     111  
     112  static chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets,chd_ph_item_t ** items,
     113  				cmph_uint32 nbuckets,cmph_uint32 nitems, cmph_uint32 max_bucket_size);
     114  
     115  static cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items ,
     116  	cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table);
     117  
     118  static inline double chd_ph_space_lower_bound(cmph_uint32 _n, cmph_uint32 _r)
     119  {
     120  	double r = _r, n = _n;
     121  	return (1 + (r/n - 1.0 + 1.0/(2.0*n))*log(1 - n/r))/log(2);
     122  };
     123  
     124  /* computes the entropy of non empty buckets.*/
     125  static inline double chd_ph_get_entropy(cmph_uint32 * disp_table, cmph_uint32 n, cmph_uint32 max_probes)
     126  {
     127  	register cmph_uint32 * probe_counts = (cmph_uint32 *) calloc(max_probes, sizeof(cmph_uint32));
     128  	register cmph_uint32 i;
     129  	register double entropy = 0;
     130  
     131  	for(i = 0; i < n; i++)
     132  	{
     133  		probe_counts[disp_table[i]]++;
     134  	};
     135  	
     136  	for(i = 0; i < max_probes; i++)
     137  	{
     138  		if(probe_counts[i] > 0)
     139  			entropy -= probe_counts[i]*log((double)probe_counts[i]/(double)n)/log(2);
     140  	};
     141  	free(probe_counts);
     142  	return entropy;
     143  };
     144  
     145  chd_ph_config_data_t *chd_ph_config_new(void)
     146  {
     147  	chd_ph_config_data_t *chd_ph;
     148  	chd_ph = (chd_ph_config_data_t *)malloc(sizeof(chd_ph_config_data_t));
     149  	assert(chd_ph);
     150  	memset(chd_ph, 0, sizeof(chd_ph_config_data_t));
     151  	
     152  	chd_ph->hashfunc = CMPH_HASH_JENKINS;
     153  	chd_ph->cs = NULL;
     154  	chd_ph->nbuckets = 0;
     155  	chd_ph->n = 0;
     156  	chd_ph->hl = NULL;
     157  
     158  	chd_ph->m = 0;
     159  	chd_ph->use_h = 1;
     160  	chd_ph->keys_per_bin = 1;
     161  	chd_ph->keys_per_bucket = 4;
     162  	chd_ph->occup_table = 0;
     163  	
     164  	return chd_ph;
     165  }
     166  
     167  void chd_ph_config_destroy(cmph_config_t *mph)
     168  {
     169  	chd_ph_config_data_t *data = (chd_ph_config_data_t *) mph->data;
     170  	DEBUGP("Destroying algorithm dependent data\n");
     171  	if(data->occup_table)
     172  	{
     173  		free(data->occup_table);
     174  		data->occup_table = NULL;
     175  	}
     176  	free(data);
     177  }
     178  
     179  
     180  void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
     181  {
     182  	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
     183  	CMPH_HASH *hashptr = hashfuncs;
     184  	cmph_uint32 i = 0;
     185  	while(*hashptr != CMPH_HASH_COUNT)
     186  	{
     187  		if (i >= 1) break; //chd_ph only uses one linear hash function
     188  		chd_ph->hashfunc = *hashptr;	
     189  		++i, ++hashptr;
     190  	}
     191  }
     192  
     193  
     194  void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket)
     195  {
     196  	chd_ph_config_data_t *chd_ph;
     197  	assert(mph);
     198  	chd_ph = (chd_ph_config_data_t *)mph->data;
     199  	if(keys_per_bucket < 1 || keys_per_bucket >= 15)
     200  	{
     201  	    keys_per_bucket = 4;
     202  	}
     203  	chd_ph->keys_per_bucket = keys_per_bucket;
     204  }
     205  
     206  
     207  void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin)
     208  {
     209  	chd_ph_config_data_t *chd_ph;
     210  	assert(mph);
     211  	chd_ph = (chd_ph_config_data_t *)mph->data;
     212  	if(keys_per_bin <= 1 || keys_per_bin >= 128)
     213  	{
     214  	    keys_per_bin = 1;
     215  	}
     216  	chd_ph->keys_per_bin = keys_per_bin;
     217  }
     218  
     219  cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, cmph_uint32 *max_bucket_size)
     220  {
     221  	register cmph_uint32 i = 0, g = 0;
     222  	cmph_uint32 hl[3];
     223  	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
     224  	char * key = NULL;
     225  	cmph_uint32 keylen = 0;
     226  	chd_ph_map_item_t * map_item;
     227  	chd_ph_map_item_t * map_items = malloc(chd_ph->m*sizeof(chd_ph_map_item_t));
     228  	register cmph_uint32 mapping_iterations = 1000;
     229  	*max_bucket_size = 0;
     230  	while(1)
     231  	{
     232  		mapping_iterations--;
     233  		if (chd_ph->hl) hash_state_destroy(chd_ph->hl);
     234  		chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m); 
     235  
     236  		chd_ph_bucket_clean(buckets, chd_ph->nbuckets);
     237  
     238  		mph->key_source->rewind(mph->key_source->data);  
     239  
     240  		for(i = 0; i < chd_ph->m; i++)
     241  		{
     242  			mph->key_source->read(mph->key_source->data, &key, &keylen);		
     243  			hash_vector(chd_ph->hl, key, keylen, hl);
     244  			
     245  			map_item = (map_items + i);
     246  
     247  			g = hl[0] % chd_ph->nbuckets;
     248  			map_item->f = hl[1] % chd_ph->n;
     249  			map_item->h = hl[2] % (chd_ph->n - 1) + 1;
     250  			map_item->bucket_num=g;
     251  			mph->key_source->dispose(mph->key_source->data, key, keylen);		
     252  // 			if(buckets[g].size == (chd_ph->keys_per_bucket << 2))
     253  // 			{
     254  // 				DEBUGP("BUCKET = %u -- SIZE = %u -- MAXIMUM SIZE = %u\n", g, buckets[g].size, (chd_ph->keys_per_bucket << 2));
     255  // 				goto error;
     256  // 			}
     257  			buckets[g].size++;
     258  			if(buckets[g].size > *max_bucket_size)
     259  			{
     260  				  *max_bucket_size = buckets[g].size;
     261  			}
     262  		}
     263  		buckets[0].items_list = 0;
     264  		for(i = 1; i < chd_ph->nbuckets; i++)
     265  		{
     266  			buckets[i].items_list = buckets[i-1].items_list + buckets[i - 1].size;
     267  			buckets[i - 1].size = 0;
     268  		};
     269  		buckets[i - 1].size = 0;
     270  		for(i = 0; i < chd_ph->m; i++)
     271  		{
     272  			map_item = (map_items + i);
     273  			if(!chd_ph_bucket_insert(buckets, map_items, items, chd_ph->nbuckets, i))
     274  				break;
     275  		}
     276  		if(i == chd_ph->m)
     277  		{
     278  			free(map_items);
     279  			return 1; // SUCCESS
     280  		}
     281  		
     282  		if(mapping_iterations == 0)
     283  		{
     284  		      goto error;
     285  		}
     286  	}
     287  error:
     288  	free(map_items);
     289  	hash_state_destroy(chd_ph->hl);
     290  	chd_ph->hl = NULL;
     291  	return 0; // FAILURE
     292  }
     293  
     294  chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_t ** _items,
     295  	cmph_uint32 nbuckets, cmph_uint32 nitems, cmph_uint32 max_bucket_size)
     296  {
     297  	chd_ph_sorted_list_t * sorted_lists = (chd_ph_sorted_list_t *) calloc(max_bucket_size + 1, sizeof(chd_ph_sorted_list_t));
     298  	
     299  	chd_ph_bucket_t * input_buckets = (*_buckets);
     300  	chd_ph_bucket_t * output_buckets;
     301  	chd_ph_item_t * input_items = (*_items);
     302  	chd_ph_item_t * output_items;
     303  	register cmph_uint32 i, j, bucket_size, position, position2;
     304  // 	cmph_uint32 non_empty_buckets;
     305  	DEBUGP("MAX BUCKET SIZE = %u\n", max_bucket_size);
     306  	// Determine size of each list of buckets
     307  	for(i = 0; i < nbuckets; i++)
     308  	{
     309  		bucket_size = input_buckets[i].size;
     310  		if(bucket_size == 0)
     311  			continue;
     312  		sorted_lists[bucket_size].size++;
     313  	};
     314  	sorted_lists[1].buckets_list = 0;
     315  	// Determine final position of list of buckets into the contiguous array that will store all the buckets
     316  	for(i = 2; i <= max_bucket_size; i++)
     317  	{
     318  		sorted_lists[i].buckets_list = sorted_lists[i-1].buckets_list + sorted_lists[i-1].size;
     319  		sorted_lists[i-1].size = 0;
     320  	};
     321  	sorted_lists[i-1].size = 0;
     322  	// Store the buckets in a new array which is sorted by bucket sizes
     323  	output_buckets = calloc(nbuckets, sizeof(chd_ph_bucket_t)); // everything is initialized with zero
     324  //  	non_empty_buckets = nbuckets;
     325  	
     326  	for(i = 0; i < nbuckets; i++)
     327  	{
     328  		bucket_size = input_buckets[i].size;
     329  		if(bucket_size == 0)
     330  		{
     331  // 			non_empty_buckets--;
     332  			continue;
     333  		};
     334  		position = sorted_lists[bucket_size].buckets_list + sorted_lists[bucket_size].size;
     335  		output_buckets[position].bucket_id = i;
     336  		output_buckets[position].items_list = input_buckets[i].items_list;
     337  		sorted_lists[bucket_size].size++;
     338  	};
     339  /*	for(i = non_empty_buckets; i < nbuckets; i++)
     340  		output_buckets[i].size=0;*/
     341  	// Return the buckets sorted in new order and free the old buckets sorted in old order
     342  	free(input_buckets);
     343  	(*_buckets) = output_buckets;
     344  	
     345  	
     346  	// Store the items according to the new order of buckets.
     347  	output_items = (chd_ph_item_t*)calloc(nitems, sizeof(chd_ph_item_t));
     348  	position = 0;
     349  	i = 0;
     350  	for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++)
     351  	{
     352  		for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size + sorted_lists[bucket_size].buckets_list; i++)
     353  		{
     354  			position2 = output_buckets[i].items_list;
     355  			output_buckets[i].items_list = position;
     356  			for(j = 0; j < bucket_size; j++)
     357  			{
     358  				output_items[position].f = input_items[position2].f;
     359  				output_items[position].h = input_items[position2].h;
     360  				position++;
     361  				position2++;
     362  			};
     363  		};
     364  	};
     365  	//Return the items sorted in new order and free the old items sorted in old order
     366  	free(input_items);
     367  	(*_items) = output_items;
     368  	return sorted_lists;
     369  };
     370  
     371  static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets,
     372  					    chd_ph_item_t *items, cmph_uint32 probe0_num, cmph_uint32 probe1_num,
     373  					    cmph_uint32 bucket_num, cmph_uint32 size)
     374  {
     375  	register cmph_uint32 i;
     376  	register chd_ph_item_t * item;
     377  	register cmph_uint32 position;
     378  
     379  	item = items + buckets[bucket_num].items_list;
     380  	// try place bucket with probe_num
     381  	if(chd_ph->keys_per_bin > 1)
     382  	{
     383  		for(i = 0; i < size; i++) // placement
     384  		{
     385  			position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
     386  			if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin)
     387  			{
     388  				break;
     389  			}
     390  			(chd_ph->occup_table[position])++;
     391  			item++;
     392  		};
     393  	} else
     394  	{
     395  		for(i = 0; i < size; i++) // placement
     396  		{
     397  			position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
     398  			if(GETBIT32(((cmph_uint32 *)chd_ph->occup_table), position))
     399  			{
     400  				break;
     401  			}
     402  			SETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
     403  			item++;
     404  		};
     405  	};
     406  	if(i != size) // Undo the placement
     407  	{
     408  		item = items + buckets[bucket_num].items_list;
     409  		if(chd_ph->keys_per_bin > 1)
     410  		{
     411  			while(1)
     412  			{
     413  				if(i == 0)
     414  				{
     415  					break;
     416  				}
     417  				position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
     418  				(chd_ph->occup_table[position])--;
     419  				item++;
     420  				i--;
     421  			};
     422  		} else
     423  		{
     424  			while(1)
     425  			{
     426  				if(i == 0)
     427  				{
     428  					break;
     429  				}
     430  				position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
     431  				UNSETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
     432  				
     433  // 				([position/32]^=(1<<(position%32));
     434  				item++;
     435  				i--;
     436  			};
     437  		};
     438  		return 0;
     439  	} 	
     440  	return 1;
     441  };
     442  
     443  static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, cmph_uint32 max_probes, 
     444                                        cmph_uint32 * disp_table, cmph_uint32 bucket_num, cmph_uint32 size)
     445  				      
     446  {
     447  	register cmph_uint32 probe0_num, probe1_num, probe_num;
     448  	probe0_num = 0;
     449  	probe1_num = 0;
     450  	probe_num = 0;
     451  	
     452  	while(1)
     453  	{
     454  		if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, bucket_num,size))
     455  		{
     456  			disp_table[buckets[bucket_num].bucket_id] = probe0_num + probe1_num * chd_ph->n;
     457  			return 1;
     458  		}
     459  		probe0_num++;
     460  		if(probe0_num >= chd_ph->n)
     461  		{
     462  			probe0_num -= chd_ph->n;
     463  			probe1_num++;
     464  		};
     465  		probe_num++;
     466  		if(probe_num >= max_probes || probe1_num >= chd_ph->n)
     467  		{
     468  			return 0;
     469  		};
     470  	};
     471  	return 0;
     472  };
     473  
     474  static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t * buckets, chd_ph_item_t *items,
     475  					cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, 
     476  					cmph_uint32 * disp_table)
     477  {
     478  	register cmph_uint32 i = 0;
     479  	register cmph_uint32 curr_bucket = 0;
     480  
     481  	for(i = max_bucket_size; i > 0; i--)
     482  	{
     483  		curr_bucket = sorted_lists[i].buckets_list;
     484  		while(curr_bucket < sorted_lists[i].size + sorted_lists[i].buckets_list)
     485  		{
     486  			if(!place_bucket(chd_ph, buckets, items, max_probes, disp_table, curr_bucket, i))
     487  			{
     488  				return 0;
     489  			}
     490  			curr_bucket++;
     491  		};
     492  	};
     493  	return 1;
     494  };
     495  
     496  static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, 
     497  					cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, 
     498  					cmph_uint32 * disp_table)
     499  {
     500  	register cmph_uint32 i,j, non_placed_bucket;
     501  	register cmph_uint32 curr_bucket;
     502  	register cmph_uint32 probe_num, probe0_num, probe1_num;
     503  	cmph_uint32 sorted_list_size;
     504  #ifdef DEBUG
     505  	cmph_uint32 items_list;
     506  	cmph_uint32 bucket_id;
     507  #endif
     508  	DEBUGP("USING HEURISTIC TO PLACE BUCKETS\n");
     509  	for(i = max_bucket_size; i > 0; i--)
     510  	{
     511  		probe_num = 0;
     512  		probe0_num = 0;
     513  		probe1_num = 0;
     514  		sorted_list_size = sorted_lists[i].size;
     515  		while(sorted_lists[i].size != 0)
     516  		{
     517  			curr_bucket = sorted_lists[i].buckets_list;
     518  			for(j = 0, non_placed_bucket = 0; j < sorted_lists[i].size; j++)
     519  			{
     520  				// if bucket is successfully placed remove it from list
     521  				if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, curr_bucket, i))
     522  				{	
     523  					disp_table[buckets[curr_bucket].bucket_id] = probe0_num + probe1_num * chd_ph->n;
     524  // 					DEBUGP("BUCKET %u PLACED --- DISPLACEMENT = %u\n", curr_bucket, disp_table[curr_bucket]);
     525  				} 
     526  				else
     527  				{
     528  // 					DEBUGP("BUCKET %u NOT PLACED\n", curr_bucket);
     529  #ifdef DEBUG
     530  					items_list = buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list;
     531  					bucket_id = buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id;
     532  #endif
     533  					buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list = buckets[curr_bucket].items_list;
     534  					buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id = buckets[curr_bucket].bucket_id;
     535  #ifdef DEBUG		
     536  					buckets[curr_bucket].items_list=items_list;
     537  					buckets[curr_bucket].bucket_id=bucket_id;
     538  #endif
     539  					non_placed_bucket++;
     540  				}
     541  				curr_bucket++;
     542  			};
     543  			sorted_lists[i].size = non_placed_bucket;
     544  			probe0_num++;
     545  			if(probe0_num >= chd_ph->n)
     546  			{
     547  				probe0_num -= chd_ph->n;
     548  				probe1_num++;
     549  			};
     550  			probe_num++;
     551  			if(probe_num >= max_probes || probe1_num >= chd_ph->n)
     552  			{
     553  				sorted_lists[i].size = sorted_list_size;
     554  				return 0;
     555  			};
     556  		};
     557  		sorted_lists[i].size = sorted_list_size;
     558  	};
     559  	return 1;
     560  };
     561  
     562  cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items ,
     563  			    cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, 
     564  			    cmph_uint32 * disp_table)
     565  {
     566  	if(chd_ph->use_h)
     567  	{
     568  		return place_buckets2(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
     569  	}
     570  	else
     571  	{
     572  		return place_buckets1(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
     573  	}
     574  
     575  }
     576  
     577  static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items,
     578                                                    cmph_uint32 * disp_table, chd_ph_sorted_list_t * sorted_lists,cmph_uint32 max_bucket_size)
     579  {
     580  	register cmph_uint32 bucket_size, i, j;
     581  	register cmph_uint32 position, probe0_num, probe1_num;
     582  	G_GNUC_UNUSED register cmph_uint32 m = 0;
     583  	register chd_ph_item_t * item;
     584  	if(chd_ph->keys_per_bin > 1)
     585  		memset(chd_ph->occup_table, 0, chd_ph->n);
     586  	else
     587  		memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32));
     588  		
     589  	for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++)
     590  		for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size +
     591  				sorted_lists[bucket_size].buckets_list; i++)
     592  		{
     593  			j = bucket_size;
     594  			item = items + buckets[i].items_list;
     595  			probe0_num = disp_table[buckets[i].bucket_id] % chd_ph->n;
     596  			probe1_num = disp_table[buckets[i].bucket_id] / chd_ph->n;
     597  			for(; j > 0; j--)
     598  			{
     599  #ifdef DEBUG
     600  				m++;
     601  #endif
     602  				position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
     603  				if(chd_ph->keys_per_bin > 1)
     604  				{
     605  					if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin)
     606  					{
     607  						return 0;
     608  					}
     609  					(chd_ph->occup_table[position])++;
     610  				} 
     611  				else
     612  				{
     613  					if(GETBIT32(((cmph_uint32*)chd_ph->occup_table), position))
     614  					{
     615  						return 0;
     616  					}
     617  					SETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
     618  				};
     619  				item++;
     620  			};
     621  		};
     622  	DEBUGP("We were able to place m = %u keys\n", m);
     623  	return 1;
     624  };
     625  
     626  
     627  cmph_t *chd_ph_new(cmph_config_t *mph, double c)
     628  {
     629  	cmph_t *mphf = NULL;
     630  	chd_ph_data_t *chd_phf = NULL;
     631  	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
     632  	
     633  	register double load_factor = c;
     634  	register cmph_uint8 searching_success = 0;
     635  	register cmph_uint32 max_probes = 1 << 20; // default value for max_probes
     636  	register cmph_uint32 iterations = 100;
     637  	chd_ph_bucket_t * buckets = NULL;
     638  	chd_ph_item_t * items = NULL;
     639  	register cmph_uint8 failure = 0;
     640  	cmph_uint32 max_bucket_size = 0;
     641  	chd_ph_sorted_list_t * sorted_lists = NULL;
     642  	cmph_uint32 * disp_table = NULL;
     643  	register double space_lower_bound = 0;
     644  	#ifdef CMPH_TIMING
     645  	double construction_time_begin = 0.0;
     646  	double construction_time = 0.0;
     647  	ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
     648  	#endif
     649  
     650  
     651  	chd_ph->m = mph->key_source->nkeys;
     652  	DEBUGP("m = %u\n", chd_ph->m);
     653  	
     654  	chd_ph->nbuckets = (cmph_uint32)(chd_ph->m/chd_ph->keys_per_bucket) + 1;
     655  	DEBUGP("nbuckets = %u\n", chd_ph->nbuckets);
     656  	
     657  	if(load_factor < 0.5 )
     658  	{
     659  		load_factor = 0.5;
     660  	}
     661  	
     662  	if(load_factor >= 0.99)
     663  	{
     664  		load_factor = 0.99;
     665  	}
     666  	
     667  	DEBUGP("load_factor = %.3f\n", load_factor);
     668  	
     669  	chd_ph->n = (cmph_uint32)(chd_ph->m/(chd_ph->keys_per_bin * load_factor)) + 1;
     670  	
     671  	//Round the number of bins to the prime immediately above
     672  	if(chd_ph->n % 2 == 0) chd_ph->n++;
     673  	for(;;)
     674  	{
     675  		if(check_primality(chd_ph->n) == 1)
     676  			break;
     677  		chd_ph->n += 2; // just odd numbers can be primes for n > 2
     678  		
     679  	};
     680  	
     681  	DEBUGP("n = %u \n", chd_ph->n);
     682  	if(chd_ph->keys_per_bin == 1)
     683  	{
     684  		space_lower_bound = chd_ph_space_lower_bound(chd_ph->m, chd_ph->n);
     685  	}
     686  	
     687  	if(mph->verbosity)
     688  	{
     689  		fprintf(stderr, "space lower bound is %.3f bits per key\n", space_lower_bound);
     690  	}
     691  
     692         	// We allocate the working tables
     693  	buckets = chd_ph_bucket_new(chd_ph->nbuckets); 
     694  	items   = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t));
     695  
     696  	max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes);
     697  	
     698  	if(chd_ph->keys_per_bin == 1)
     699  		chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32));
     700  	else
     701  		chd_ph->occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8));
     702  		
     703  	disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32));
     704  // 	
     705  // 	init_genrand(time(0));
     706  	
     707  	while(1)
     708  	{
     709  		iterations --;
     710  		if (mph->verbosity)
     711  		{
     712  			fprintf(stderr, "Starting mapping step for mph creation of %u keys with %u bins\n", chd_ph->m, chd_ph->n);
     713  		}
     714  		
     715  		if(!chd_ph_mapping(mph, buckets, items, &max_bucket_size))
     716  		{
     717  			if (mph->verbosity)
     718  			{
     719  				fprintf(stderr, "Failure in mapping step\n");		
     720  			}
     721  			failure = 1;
     722  			goto cleanup;
     723  		}
     724  
     725  		if (mph->verbosity)
     726  		{
     727  			fprintf(stderr, "Starting ordering step\n");
     728  		}
     729  		if(sorted_lists)
     730  		{
     731  			free(sorted_lists);
     732  		}
     733  
     734          	sorted_lists = chd_ph_ordering(&buckets, &items, chd_ph->nbuckets, chd_ph->m, max_bucket_size);
     735  		
     736  		if (mph->verbosity)
     737  		{
     738  			fprintf(stderr, "Starting searching step\n");
     739  		}
     740  		
     741  		searching_success = chd_ph_searching(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
     742  		if(searching_success) break;
     743  		
     744  		// reset occup_table
     745  		if(chd_ph->keys_per_bin > 1)
     746  			memset(chd_ph->occup_table, 0, chd_ph->n);
     747  		else
     748  			memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32));
     749  		if(iterations == 0)
     750  		{
     751  			// Cleanup memory
     752  			if (mph->verbosity)
     753  			{
     754  				fprintf(stderr, "Failure because the max trials was exceeded\n");
     755  			}
     756  			failure = 1;
     757  			goto cleanup;
     758  		};
     759  	}
     760  
     761  	#ifdef DEBUG
     762  	{
     763  		if(!chd_ph_check_bin_hashing(chd_ph, buckets, items, disp_table,sorted_lists,max_bucket_size))
     764  		{
     765  		
     766  			DEBUGP("Error for bin packing generation");
     767  			failure = 1;
     768  			goto cleanup;
     769  		}
     770  	}
     771  	#endif
     772  	
     773  	if (mph->verbosity)
     774  	{
     775  		fprintf(stderr, "Starting compressing step\n");
     776  	}
     777  	
     778  	if(chd_ph->cs)
     779  	{
     780  		free(chd_ph->cs);
     781  	}
     782  	chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
     783  	compressed_seq_init(chd_ph->cs);
     784  	compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets);
     785  	
     786  	#ifdef CMPH_TIMING
     787  	ELAPSED_TIME_IN_SECONDS(&construction_time);
     788  	register double entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes);
     789  	DEBUGP("Entropy = %.4f\n", entropy/chd_ph->m);
     790  	#endif
     791  
     792  cleanup:
     793  	chd_ph_bucket_destroy(buckets); 
     794  	free(items);
     795  	free(sorted_lists);
     796  	free(disp_table);
     797  	if(failure) 
     798  	{
     799  		if(chd_ph->hl)
     800  		{
     801  			hash_state_destroy(chd_ph->hl);
     802  		}
     803  		chd_ph->hl = NULL;
     804  		return NULL;
     805  	}
     806  
     807  	mphf = (cmph_t *)malloc(sizeof(cmph_t));
     808  	mphf->algo = mph->algo;
     809  	chd_phf = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
     810  	
     811  	chd_phf->cs = chd_ph->cs;
     812  	chd_ph->cs = NULL; //transfer memory ownership
     813  	chd_phf->hl = chd_ph->hl;
     814  	chd_ph->hl = NULL; //transfer memory ownership
     815  	chd_phf->n = chd_ph->n;
     816  	chd_phf->nbuckets = chd_ph->nbuckets;
     817  	
     818  	mphf->data = chd_phf;
     819  	mphf->size = chd_ph->n;
     820  
     821  	DEBUGP("Successfully generated minimal perfect hash\n");
     822  	if (mph->verbosity)
     823  	{
     824  		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
     825  	}
     826  	
     827  	#ifdef CMPH_TIMING	
     828  	register cmph_uint32 space_usage = chd_ph_packed_size(mphf)*8;
     829  	construction_time = construction_time - construction_time_begin;
     830  	fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\t%.4f\t%.4f\n", chd_ph->m, load_factor, chd_ph->keys_per_bucket, construction_time, space_usage/(double)chd_ph->m, space_lower_bound, entropy/chd_ph->m);
     831  	#endif	
     832  
     833  	return mphf;
     834  }
     835  
     836  
     837  
     838  void chd_ph_load(FILE *fd, cmph_t *mphf)
     839  {
     840  	char *buf = NULL;
     841  	cmph_uint32 buflen;
     842  	register size_t nbytes;
     843  	chd_ph_data_t *chd_ph = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
     844  
     845  	DEBUGP("Loading chd_ph mphf\n");
     846  	mphf->data = chd_ph;
     847  
     848  	nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
     849  	DEBUGP("Hash state has %u bytes\n", buflen);
     850  	buf = (char *)malloc((size_t)buflen);
     851  	nbytes = fread(buf, (size_t)buflen, (size_t)1, fd);
     852  	chd_ph->hl = hash_state_load(buf, buflen);
     853  	free(buf);
     854  	
     855  	nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
     856  	DEBUGP("Compressed sequence structure has %u bytes\n", buflen);
     857  	buf = (char *)malloc((size_t)buflen);
     858  	nbytes = fread(buf, (size_t)buflen, (size_t)1, fd);
     859  	chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); 
     860  	compressed_seq_load(chd_ph->cs, buf, buflen);
     861  	free(buf);
     862  	
     863  	// loading n and nbuckets
     864  	DEBUGP("Reading n and nbuckets\n");
     865  	nbytes = fread(&(chd_ph->n), sizeof(cmph_uint32), (size_t)1, fd);	
     866  	nbytes = fread(&(chd_ph->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);	
     867  	if (nbytes == 0 && ferror(fd)) {
     868            fprintf(stderr, "ERROR: %s\n", strerror(errno));
     869          }
     870  
     871  }
     872  
     873  int chd_ph_dump(cmph_t *mphf, FILE *fd)
     874  {
     875  	char *buf = NULL;
     876  	cmph_uint32 buflen;
     877  	register size_t nbytes;
     878  	chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
     879  	
     880  	__cmph_dump(mphf, fd);
     881  
     882  	hash_state_dump(data->hl, &buf, &buflen);
     883  	DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
     884  	nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
     885  	nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
     886  	free(buf);
     887  
     888  	compressed_seq_dump(data->cs, &buf, &buflen);
     889  	DEBUGP("Dumping compressed sequence structure with %u bytes to disk\n", buflen);
     890  	nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
     891  	nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
     892  	free(buf);
     893  
     894  	// dumping n and nbuckets
     895  	nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd);
     896  	nbytes = fwrite(&(data->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);
     897  	if (nbytes == 0 && ferror(fd)) {
     898            fprintf(stderr, "ERROR: %s\n", strerror(errno));
     899            return 0;
     900          }
     901  	return 1;
     902  }
     903  
     904  void chd_ph_destroy(cmph_t *mphf)
     905  {
     906  	chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
     907  	compressed_seq_destroy(data->cs);
     908  	free(data->cs);
     909  	hash_state_destroy(data->hl);
     910  	free(data);
     911  	free(mphf);
     912  
     913  }
     914  
     915  cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
     916  {
     917  	register chd_ph_data_t * chd_ph = mphf->data;
     918  	cmph_uint32 hl[3];
     919  	register cmph_uint32 disp,position;
     920  	register cmph_uint32 probe0_num,probe1_num;
     921  	register cmph_uint32 f,g,h;
     922  	hash_vector(chd_ph->hl, key, keylen, hl);	
     923  	g = hl[0] % chd_ph->nbuckets;
     924  	f = hl[1] % chd_ph->n;
     925  	h = hl[2] % (chd_ph->n-1) + 1;
     926  	
     927  	disp = compressed_seq_query(chd_ph->cs, g);
     928  	probe0_num = disp % chd_ph->n;
     929  	probe1_num = disp/chd_ph->n;
     930  	position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % chd_ph->n);
     931  	return position;
     932  }
     933  
     934  void chd_ph_pack(cmph_t *mphf, void *packed_mphf)
     935  {
     936  	chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
     937  	cmph_uint8 * ptr = packed_mphf;
     938  
     939  	// packing hl type
     940  	CMPH_HASH hl_type = hash_get_type(data->hl);
     941  	*((cmph_uint32 *) ptr) = hl_type;
     942  	ptr += sizeof(cmph_uint32);
     943  
     944  	// packing hl
     945  	hash_state_pack(data->hl, ptr);
     946  	ptr += hash_state_packed_size(hl_type);
     947  
     948  	// packing n
     949  	*((cmph_uint32 *) ptr) = data->n;
     950  	ptr += sizeof(data->n);
     951  
     952  	// packing nbuckets
     953  	*((cmph_uint32 *) ptr) = data->nbuckets;
     954  	ptr += sizeof(data->nbuckets);
     955  
     956  	// packing cs
     957  	compressed_seq_pack(data->cs, ptr);
     958  	//ptr += compressed_seq_packed_size(data->cs);
     959  
     960  }
     961  
     962  cmph_uint32 chd_ph_packed_size(cmph_t *mphf)
     963  {
     964  	register chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
     965  	register CMPH_HASH hl_type = hash_get_type(data->hl); 
     966  	register cmph_uint32 hash_state_pack_size =  hash_state_packed_size(hl_type);
     967  	register cmph_uint32 cs_pack_size = compressed_seq_packed_size(data->cs);
     968  	
     969  	return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_pack_size + cs_pack_size + 3*sizeof(cmph_uint32));
     970  
     971  }
     972  
     973  cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
     974  {
     975  	register CMPH_HASH hl_type  = *(cmph_uint32 *)packed_mphf;
     976  	register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4;
     977  	
     978  	register cmph_uint32 * ptr = (cmph_uint32 *)(hl_ptr + hash_state_packed_size(hl_type));
     979  	register cmph_uint32 n = *ptr++;
     980  	register cmph_uint32 nbuckets = *ptr++;
     981  	cmph_uint32 hl[3];
     982  		
     983  	register cmph_uint32 disp,position;
     984  	register cmph_uint32 probe0_num,probe1_num;
     985  	register cmph_uint32 f,g,h;
     986  	
     987  	hash_vector_packed(hl_ptr, hl_type, key, keylen, hl);
     988  
     989  	g = hl[0] % nbuckets;
     990  	f = hl[1] % n;
     991  	h = hl[2] % (n-1) + 1;
     992  	
     993  	disp = compressed_seq_query_packed(ptr, g);
     994  	probe0_num = disp % n;
     995  	probe1_num = disp/n;
     996  	position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % n);
     997  	return position;
     998  }
     999  
    1000  
    1001