(root)/
glib-2.79.0/
girepository/
cmph/
main.c
       1  #ifdef WIN32
       2  #include "wingetopt.h"
       3  #else
       4  #include <getopt.h>
       5  #endif
       6  #include <stdio.h>
       7  #include <stdlib.h>
       8  #include <errno.h>
       9  #include <string.h>
      10  #include <time.h>
      11  #include <limits.h>
      12  #include <assert.h>
      13  #include "cmph.h"
      14  #include "hash.h"
      15  
      16  #ifdef WIN32
      17  #define VERSION "0.8"
      18  #else
      19  #include "config.h"
      20  #endif
      21  
      22  
      23  void usage(const char *prg)
      24  {
      25  	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph]  keysfile\n", prg);   
      26  }
      27  void usage_long(const char *prg)
      28  {
      29  	cmph_uint32 i;
      30  	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg);   
      31  	fprintf(stderr, "Minimum perfect hashing tool\n\n"); 
      32  	fprintf(stderr, "  -h\t print this help message\n");
      33  	fprintf(stderr, "  -c\t c value determines:\n");
      34  	fprintf(stderr, "    \t  * the number of vertices in the graph for the algorithms BMZ and CHM\n");
      35  	fprintf(stderr, "    \t  * the number of bits per key required in the FCH algorithm\n");
      36  	fprintf(stderr, "    \t  * the load factor in the CHD_PH algorithm\n");
      37  	fprintf(stderr, "  -a\t algorithm - valid values are\n");
      38  	for (i = 0; i < CMPH_COUNT; ++i) fprintf(stderr, "    \t  * %s\n", cmph_names[i]);
      39  	fprintf(stderr, "  -f\t hash function (may be used multiple times) - valid values are\n");
      40  	for (i = 0; i < CMPH_HASH_COUNT; ++i) fprintf(stderr, "    \t  * %s\n", cmph_hash_names[i]);
      41  	fprintf(stderr, "  -V\t print version number and exit\n");
      42  	fprintf(stderr, "  -v\t increase verbosity (may be used multiple times)\n");
      43  	fprintf(stderr, "  -k\t number of keys\n");
      44  	fprintf(stderr, "  -g\t generation mode\n");
      45  	fprintf(stderr, "  -s\t random seed\n");
      46  	fprintf(stderr, "  -m\t minimum perfect hash function file \n");
      47  	fprintf(stderr, "  -M\t main memory availability (in MB) used in BRZ algorithm \n");
      48  	fprintf(stderr, "  -d\t temporary directory used in BRZ algorithm \n");
      49  	fprintf(stderr, "  -b\t the meaning of this parameter depends on the algorithm selected in the -a option:\n");
      50  	fprintf(stderr, "    \t  * For BRZ it is used to make the maximal number of keys in a bucket lower than 256.\n");
      51  	fprintf(stderr, "    \t    In this case its value should be an integer in the range [64,175]. Default is 128.\n\n");
      52  	fprintf(stderr, "    \t  * For BDZ it is used to determine the size of some precomputed rank\n");
      53  	fprintf(stderr, "    \t    information and its value should be an integer in the range [3,10]. Default\n");
      54  	fprintf(stderr, "    \t    is 7. The larger is this value, the more compact are the resulting functions\n");
      55  	fprintf(stderr, "    \t    and the slower are them at evaluation time.\n\n");
      56  	fprintf(stderr, "    \t  * For CHD and CHD_PH it is used to set the average number of keys per bucket\n");
      57  	fprintf(stderr, "    \t    and its value should be an integer in the range [1,32]. Default is 4. The\n");
      58  	fprintf(stderr, "    \t    larger is this value, the slower is the construction of the functions.\n");
      59  	fprintf(stderr, "    \t    This parameter has no effect for other algorithms.\n\n");
      60  	fprintf(stderr, "  -t\t set the number of keys per bin for a t-perfect hashing function. A t-perfect\n");	
      61  	fprintf(stderr, "    \t hash function allows at most t collisions in a given bin. This parameter applies\n");
      62  	fprintf(stderr, "    \t only to the CHD and CHD_PH algorithms. Its value should be an integer in the\n");
      63  	fprintf(stderr, "    \t range [1,128]. Defaul is 1\n");
      64  	fprintf(stderr, "  keysfile\t line separated file with keys\n");
      65  }
      66  
      67  int main(int argc, char **argv)
      68  {
      69  	cmph_uint32 verbosity = 0;
      70  	char generate = 0;
      71  	char *mphf_file = NULL;
      72  	FILE *mphf_fd = stdout;
      73  	const char *keys_file = NULL;
      74  	FILE *keys_fd;
      75  	cmph_uint32 nkeys = UINT_MAX;
      76  	cmph_uint32 seed = UINT_MAX;
      77  	CMPH_HASH *hashes = NULL;
      78  	cmph_uint32 nhashes = 0;
      79  	cmph_uint32 i;
      80  	CMPH_ALGO mph_algo = CMPH_CHM;
      81  	double c = 0;
      82  	cmph_config_t *config = NULL;
      83  	cmph_t *mphf = NULL;
      84  	char * tmp_dir = NULL;
      85  	cmph_io_adapter_t *source;
      86  	cmph_uint32 memory_availability = 0;
      87  	cmph_uint32 b = 0;
      88  	cmph_uint32 keys_per_bin = 1;
      89  	while (1)
      90  	{
      91  		char ch = (char)getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:");
      92  		if (ch == -1) break;
      93  		switch (ch)
      94  		{
      95  			case 's':
      96  				{
      97  					char *cptr;
      98  					seed = (cmph_uint32)strtoul(optarg, &cptr, 10);
      99  					if(*cptr != 0) {
     100  						fprintf(stderr, "Invalid seed %s\n", optarg);
     101  						exit(1);
     102  					}
     103  				}
     104  				break;
     105  			case 'c':
     106  				{
     107  					char *endptr;
     108  					c = strtod(optarg, &endptr);
     109  					if(*endptr != 0) {
     110  						fprintf(stderr, "Invalid c value %s\n", optarg);
     111  						exit(1);
     112  					}
     113  				}
     114  				break;
     115  			case 'g':
     116  				generate = 1;
     117  				break;
     118  			case 'k':
     119  			        {
     120  					char *endptr;
     121  					nkeys = (cmph_uint32)strtoul(optarg, &endptr, 10);
     122  					if(*endptr != 0) {
     123  						fprintf(stderr, "Invalid number of keys %s\n", optarg);
     124  						exit(1);
     125  					}
     126  				}
     127  				break;
     128  			case 'm':
     129  				mphf_file = strdup(optarg);
     130  				break;
     131  			case 'd':
     132  				tmp_dir = strdup(optarg);
     133  				break;
     134  			case 'M':
     135  				{
     136  					char *cptr;
     137  					memory_availability = (cmph_uint32)strtoul(optarg, &cptr, 10);
     138  					if(*cptr != 0) {
     139  						fprintf(stderr, "Invalid memory availability %s\n", optarg);
     140  						exit(1);
     141  					}
     142  				}
     143  				break;
     144  			case 'b':
     145  				{
     146  					char *cptr;
     147  					b =  (cmph_uint32)strtoul(optarg, &cptr, 10);
     148  					if(*cptr != 0) {
     149  						fprintf(stderr, "Parameter b was not found: %s\n", optarg);
     150  						exit(1);
     151  					}
     152  				}
     153  				break;
     154  			case 't':
     155  				{
     156  					char *cptr;
     157  					keys_per_bin = (cmph_uint32)strtoul(optarg, &cptr, 10);
     158  					if(*cptr != 0) {
     159  						fprintf(stderr, "Parameter t was not found: %s\n", optarg);
     160  						exit(1);
     161  					}
     162  				}
     163  				break;
     164  			case 'v':
     165  				++verbosity;
     166  				break;
     167  			case 'V':
     168  				printf("%s\n", VERSION);
     169  				return 0;
     170  			case 'h':
     171  				usage_long(argv[0]);
     172  				return 0;
     173  			case 'a':
     174  				{
     175  				char valid = 0;
     176  				for (i = 0; i < CMPH_COUNT; ++i)
     177  				{
     178  					if (strcmp(cmph_names[i], optarg) == 0)
     179  					{
     180  						mph_algo = i;
     181  						valid = 1;
     182  						break;
     183  					}
     184  				}
     185  				if (!valid) 
     186  				{
     187  					fprintf(stderr, "Invalid mph algorithm: %s. It is not available in version %s\n", optarg, VERSION);
     188  					return -1;
     189  				}
     190  				}
     191  				break;
     192  			case 'f':
     193  				{
     194  				char valid = 0;
     195  				for (i = 0; i < CMPH_HASH_COUNT; ++i)
     196  				{
     197  					if (strcmp(cmph_hash_names[i], optarg) == 0)
     198  					{
     199  						hashes = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 ));
     200  						hashes[nhashes] = i;
     201  						hashes[nhashes + 1] = CMPH_HASH_COUNT;
     202  						++nhashes;
     203  						valid = 1;
     204  						break;
     205  					}
     206  				}
     207  				if (!valid) 
     208  				{
     209  					fprintf(stderr, "Invalid hash function: %s\n", optarg);
     210  					return -1;
     211  				}
     212  				}
     213  				break;
     214  			default:
     215  				usage(argv[0]);
     216  				return 1;
     217  		}
     218  	}
     219  
     220  	if (optind != argc - 1)
     221  	{
     222  		usage(argv[0]);
     223  		return 1;
     224  	}
     225  	keys_file = argv[optind];
     226    
     227  	if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
     228  	srand(seed);
     229  	int ret = 0;
     230  	if (mphf_file == NULL)
     231  	{
     232  		mphf_file = (char *)malloc(strlen(keys_file) + 5);
     233  		memcpy(mphf_file, keys_file, strlen(keys_file));
     234  		memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5);
     235  	}	
     236  
     237  	keys_fd = fopen(keys_file, "r");
     238  
     239  	if (keys_fd == NULL)
     240  	{
     241  		fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
     242  		return -1;
     243  	}
     244  
     245  	if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
     246  	if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
     247  	else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
     248  	if (generate)
     249  	{
     250  		//Create mphf
     251  		mphf_fd = fopen(mphf_file, "w");
     252  		config = cmph_config_new(source);
     253  		cmph_config_set_algo(config, mph_algo);
     254  		if (nhashes) cmph_config_set_hashfuncs(config, hashes);
     255  		cmph_config_set_verbosity(config, verbosity);
     256  		cmph_config_set_tmp_dir(config, (cmph_uint8 *) tmp_dir);
     257  		cmph_config_set_mphf_fd(config, mphf_fd);
     258  		cmph_config_set_memory_availability(config, memory_availability);
     259  		cmph_config_set_b(config, b);
     260  		cmph_config_set_keys_per_bin(config, keys_per_bin);
     261  		
     262  		//if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15;
     263  		if(mph_algo == CMPH_BMZ  && c >= 2.0) c=1.15;
     264  		if (c != 0) cmph_config_set_graphsize(config, c);
     265  		mphf = cmph_new(config);
     266  
     267  		cmph_config_destroy(config);
     268  		if (mphf == NULL)
     269  		{
     270  			fprintf(stderr, "Unable to create minimum perfect hashing function\n");
     271  			//cmph_config_destroy(config);
     272  			free(mphf_file);
     273  			return -1;
     274  		}
     275  
     276  		if (mphf_fd == NULL)
     277  		{
     278  			fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno));
     279  			free(mphf_file);
     280  			return -1;
     281  		}
     282  		cmph_dump(mphf, mphf_fd); 
     283  		cmph_destroy(mphf);	
     284  		fclose(mphf_fd);
     285  	}
     286  	else
     287  	{
     288  		cmph_uint8 * hashtable = NULL;
     289  		mphf_fd = fopen(mphf_file, "r");
     290  		if (mphf_fd == NULL)
     291  		{
     292  			fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
     293  			free(mphf_file);
     294  			return -1;
     295  		}
     296  		mphf = cmph_load(mphf_fd);
     297  		fclose(mphf_fd);
     298  		if (!mphf)
     299  		{
     300  			fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
     301  			free(mphf_file);
     302  			return -1;
     303  		}
     304  		cmph_uint32 siz = cmph_size(mphf);
     305  		hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8));
     306  		memset(hashtable, 0,(size_t) siz);
     307  		//check all keys
     308  		for (i = 0; i < source->nkeys; ++i)
     309  		{
     310  			cmph_uint32 h;
     311  			char *buf;
     312  			cmph_uint32 buflen = 0;
     313  			source->read(source->data, &buf, &buflen);
     314  			h = cmph_search(mphf, buf, buflen);
     315  			if (!(h < siz))
     316  			{
     317  				fprintf(stderr, "Unknown key %*s in the input.\n", buflen, buf);
     318  				ret = 1;
     319  			} else if(hashtable[h] >= keys_per_bin)
     320  			{
     321  				fprintf(stderr, "More than %u keys were mapped to bin %u\n", keys_per_bin, h);
     322  				fprintf(stderr, "Duplicated or unknown key %*s in the input\n", buflen, buf);
     323  				ret = 1;
     324  			} else hashtable[h]++;
     325  
     326  			if (verbosity)
     327  			{
     328  				printf("%s -> %u\n", buf, h);
     329  			}
     330  			source->dispose(source->data, buf, buflen);
     331  		}
     332  		
     333  		cmph_destroy(mphf);
     334  		free(hashtable);
     335  	}
     336  	fclose(keys_fd);
     337  	free(mphf_file);
     338  	free(tmp_dir);
     339          cmph_io_nlfile_adapter_destroy(source);
     340  	return ret;
     341    
     342  }