(root)/
binutils-2.41/
intl/
localcharset.c
       1  /* Determine a canonical name for the current locale's character encoding.
       2  
       3     Copyright (C) 2000-2003 Free Software Foundation, Inc.
       4  
       5     This program is free software; you can redistribute it and/or modify it
       6     under the terms of the GNU Library General Public License as published
       7     by the Free Software Foundation; either version 2, or (at your option)
       8     any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Library General Public License for more details.
      14  
      15     You should have received a copy of the GNU Library General Public
      16     License along with this program; if not, write to the Free Software
      17     Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301,
      18     USA.  */
      19  
      20  /* Written by Bruno Haible <bruno@clisp.org>.  */
      21  
      22  #ifdef HAVE_CONFIG_H
      23  # include <config.h>
      24  #endif
      25  
      26  /* Specification.  */
      27  #include "localcharset.h"
      28  
      29  #if HAVE_STDDEF_H
      30  # include <stddef.h>
      31  #endif
      32  
      33  #include <stdio.h>
      34  #if HAVE_STRING_H
      35  # include <string.h>
      36  #else
      37  # include <strings.h>
      38  #endif
      39  #if HAVE_STDLIB_H
      40  # include <stdlib.h>
      41  #endif
      42  
      43  #if defined _WIN32 || defined __WIN32__
      44  # undef WIN32   /* avoid warning on mingw32 */
      45  # define WIN32
      46  #endif
      47  
      48  #if defined __EMX__
      49  /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
      50  # define OS2
      51  #endif
      52  
      53  #if !defined WIN32
      54  # if HAVE_LANGINFO_CODESET
      55  #  include <langinfo.h>
      56  # else
      57  #  if HAVE_SETLOCALE
      58  #   include <locale.h>
      59  #  endif
      60  # endif
      61  #elif defined WIN32
      62  # define WIN32_LEAN_AND_MEAN
      63  # include <windows.h>
      64  #endif
      65  #if defined OS2
      66  # define INCL_DOS
      67  # include <os2.h>
      68  #endif
      69  
      70  #if ENABLE_RELOCATABLE
      71  # include "relocatable.h"
      72  #else
      73  # define relocate(pathname) (pathname)
      74  #endif
      75  
      76  #if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__
      77    /* Win32, OS/2, DOS */
      78  # define ISSLASH(C) ((C) == '/' || (C) == '\\')
      79  #endif
      80  
      81  #ifndef DIRECTORY_SEPARATOR
      82  # define DIRECTORY_SEPARATOR '/'
      83  #endif
      84  
      85  #ifndef ISSLASH
      86  # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
      87  #endif
      88  
      89  #ifdef HAVE_GETC_UNLOCKED
      90  # undef getc
      91  # define getc getc_unlocked
      92  #endif
      93  
      94  /* The following static variable is declared 'volatile' to avoid a
      95     possible multithread problem in the function get_charset_aliases. If we
      96     are running in a threaded environment, and if two threads initialize
      97     'charset_aliases' simultaneously, both will produce the same value,
      98     and everything will be ok if the two assignments to 'charset_aliases'
      99     are atomic. But I don't know what will happen if the two assignments mix.  */
     100  #if __STDC__ != 1
     101  # define volatile /* empty */
     102  #endif
     103  /* Pointer to the contents of the charset.alias file, if it has already been
     104     read, else NULL.  Its format is:
     105     ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
     106  static const char * volatile charset_aliases;
     107  
     108  /* Return a pointer to the contents of the charset.alias file.  */
     109  static const char *
     110  get_charset_aliases ()
     111  {
     112    const char *cp;
     113  
     114    cp = charset_aliases;
     115    if (cp == NULL)
     116      {
     117  #if !(defined VMS || defined WIN32)
     118        FILE *fp;
     119        const char *dir = relocate (LIBDIR);
     120        const char *base = "charset.alias";
     121        char *file_name;
     122  
     123        /* Concatenate dir and base into freshly allocated file_name.  */
     124        {
     125  	size_t dir_len = strlen (dir);
     126  	size_t base_len = strlen (base);
     127  	int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
     128  	file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
     129  	if (file_name != NULL)
     130  	  {
     131  	    memcpy (file_name, dir, dir_len);
     132  	    if (add_slash)
     133  	      file_name[dir_len] = DIRECTORY_SEPARATOR;
     134  	    memcpy (file_name + dir_len + add_slash, base, base_len + 1);
     135  	  }
     136        }
     137  
     138        if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
     139  	/* Out of memory or file not found, treat it as empty.  */
     140  	cp = "";
     141        else
     142  	{
     143  	  /* Parse the file's contents.  */
     144  	  int c;
     145  	  char buf1[50+1];
     146  	  char buf2[50+1];
     147  	  char *res_ptr = NULL;
     148  	  size_t res_size = 0;
     149  	  size_t l1, l2;
     150  
     151  	  for (;;)
     152  	    {
     153  	      c = getc (fp);
     154  	      if (c == EOF)
     155  		break;
     156  	      if (c == '\n' || c == ' ' || c == '\t')
     157  		continue;
     158  	      if (c == '#')
     159  		{
     160  		  /* Skip comment, to end of line.  */
     161  		  do
     162  		    c = getc (fp);
     163  		  while (!(c == EOF || c == '\n'));
     164  		  if (c == EOF)
     165  		    break;
     166  		  continue;
     167  		}
     168  	      ungetc (c, fp);
     169  	      if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
     170  		break;
     171  	      l1 = strlen (buf1);
     172  	      l2 = strlen (buf2);
     173  	      if (res_size == 0)
     174  		{
     175  		  res_size = l1 + 1 + l2 + 1;
     176  		  res_ptr = (char *) malloc (res_size + 1);
     177  		}
     178  	      else
     179  		{
     180  		  res_size += l1 + 1 + l2 + 1;
     181  		  res_ptr = (char *) realloc (res_ptr, res_size + 1);
     182  		}
     183  	      if (res_ptr == NULL)
     184  		{
     185  		  /* Out of memory. */
     186  		  res_size = 0;
     187  		  break;
     188  		}
     189  	      strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
     190  	      strcpy (res_ptr + res_size - (l2 + 1), buf2);
     191  	    }
     192  	  fclose (fp);
     193  	  if (res_size == 0)
     194  	    cp = "";
     195  	  else
     196  	    {
     197  	      *(res_ptr + res_size) = '\0';
     198  	      cp = res_ptr;
     199  	    }
     200  	}
     201  
     202        if (file_name != NULL)
     203  	free (file_name);
     204  
     205  #else
     206  
     207  # if defined VMS
     208        /* To avoid the troubles of an extra file charset.alias_vms in the
     209  	 sources of many GNU packages, simply inline the aliases here.  */
     210        /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
     211  	 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
     212  	 section 10.7 "Handling Different Character Sets".  */
     213        cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
     214  	   "ISO8859-2" "\0" "ISO-8859-2" "\0"
     215  	   "ISO8859-5" "\0" "ISO-8859-5" "\0"
     216  	   "ISO8859-7" "\0" "ISO-8859-7" "\0"
     217  	   "ISO8859-8" "\0" "ISO-8859-8" "\0"
     218  	   "ISO8859-9" "\0" "ISO-8859-9" "\0"
     219  	   /* Japanese */
     220  	   "eucJP" "\0" "EUC-JP" "\0"
     221  	   "SJIS" "\0" "SHIFT_JIS" "\0"
     222  	   "DECKANJI" "\0" "DEC-KANJI" "\0"
     223  	   "SDECKANJI" "\0" "EUC-JP" "\0"
     224  	   /* Chinese */
     225  	   "eucTW" "\0" "EUC-TW" "\0"
     226  	   "DECHANYU" "\0" "DEC-HANYU" "\0"
     227  	   "DECHANZI" "\0" "GB2312" "\0"
     228  	   /* Korean */
     229  	   "DECKOREAN" "\0" "EUC-KR" "\0";
     230  # endif
     231  
     232  # if defined WIN32
     233        /* To avoid the troubles of installing a separate file in the same
     234  	 directory as the DLL and of retrieving the DLL's directory at
     235  	 runtime, simply inline the aliases here.  */
     236  
     237        cp = "CP936" "\0" "GBK" "\0"
     238  	   "CP1361" "\0" "JOHAB" "\0"
     239  	   "CP20127" "\0" "ASCII" "\0"
     240  	   "CP20866" "\0" "KOI8-R" "\0"
     241  	   "CP21866" "\0" "KOI8-RU" "\0"
     242  	   "CP28591" "\0" "ISO-8859-1" "\0"
     243  	   "CP28592" "\0" "ISO-8859-2" "\0"
     244  	   "CP28593" "\0" "ISO-8859-3" "\0"
     245  	   "CP28594" "\0" "ISO-8859-4" "\0"
     246  	   "CP28595" "\0" "ISO-8859-5" "\0"
     247  	   "CP28596" "\0" "ISO-8859-6" "\0"
     248  	   "CP28597" "\0" "ISO-8859-7" "\0"
     249  	   "CP28598" "\0" "ISO-8859-8" "\0"
     250  	   "CP28599" "\0" "ISO-8859-9" "\0"
     251  	   "CP28605" "\0" "ISO-8859-15" "\0";
     252  # endif
     253  #endif
     254  
     255        charset_aliases = cp;
     256      }
     257  
     258    return cp;
     259  }
     260  
     261  /* Determine the current locale's character encoding, and canonicalize it
     262     into one of the canonical names listed in config.charset.
     263     The result must not be freed; it is statically allocated.
     264     If the canonical name cannot be determined, the result is a non-canonical
     265     name.  */
     266  
     267  #ifdef STATIC
     268  STATIC
     269  #endif
     270  const char *
     271  locale_charset ()
     272  {
     273    const char *codeset;
     274    const char *aliases;
     275  
     276  #if !(defined WIN32 || defined OS2)
     277  
     278  # if HAVE_LANGINFO_CODESET
     279  
     280    /* Most systems support nl_langinfo (CODESET) nowadays.  */
     281    codeset = nl_langinfo (CODESET);
     282  
     283  # else
     284  
     285    /* On old systems which lack it, use setlocale or getenv.  */
     286    const char *locale = NULL;
     287  
     288    /* But most old systems don't have a complete set of locales.  Some
     289       (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     290       use setlocale here; it would return "C" when it doesn't support the
     291       locale name the user has set.  */
     292  #  if HAVE_SETLOCALE && 0
     293    locale = setlocale (LC_CTYPE, NULL);
     294  #  endif
     295    if (locale == NULL || locale[0] == '\0')
     296      {
     297        locale = getenv ("LC_ALL");
     298        if (locale == NULL || locale[0] == '\0')
     299  	{
     300  	  locale = getenv ("LC_CTYPE");
     301  	  if (locale == NULL || locale[0] == '\0')
     302  	    locale = getenv ("LANG");
     303  	}
     304      }
     305  
     306    /* On some old systems, one used to set locale = "iso8859_1". On others,
     307       you set it to "language_COUNTRY.charset". In any case, we resolve it
     308       through the charset.alias file.  */
     309    codeset = locale;
     310  
     311  # endif
     312  
     313  #elif defined WIN32
     314  
     315    static char buf[2 + 10 + 1];
     316  
     317    /* Woe32 has a function returning the locale's codepage as a number.  */
     318    sprintf (buf, "CP%u", GetACP ());
     319    codeset = buf;
     320  
     321  #elif defined OS2
     322  
     323    const char *locale;
     324    static char buf[2 + 10 + 1];
     325    ULONG cp[3];
     326    ULONG cplen;
     327  
     328    /* Allow user to override the codeset, as set in the operating system,
     329       with standard language environment variables.  */
     330    locale = getenv ("LC_ALL");
     331    if (locale == NULL || locale[0] == '\0')
     332      {
     333        locale = getenv ("LC_CTYPE");
     334        if (locale == NULL || locale[0] == '\0')
     335  	locale = getenv ("LANG");
     336      }
     337    if (locale != NULL && locale[0] != '\0')
     338      {
     339        /* If the locale name contains an encoding after the dot, return it.  */
     340        const char *dot = strchr (locale, '.');
     341  
     342        if (dot != NULL)
     343  	{
     344  	  const char *modifier;
     345  
     346  	  dot++;
     347  	  /* Look for the possible @... trailer and remove it, if any.  */
     348  	  modifier = strchr (dot, '@');
     349  	  if (modifier == NULL)
     350  	    return dot;
     351  	  if (modifier - dot < sizeof (buf))
     352  	    {
     353  	      memcpy (buf, dot, modifier - dot);
     354  	      buf [modifier - dot] = '\0';
     355  	      return buf;
     356  	    }
     357  	}
     358  
     359        /* Resolve through the charset.alias file.  */
     360        codeset = locale;
     361      }
     362    else
     363      {
     364        /* OS/2 has a function returning the locale's codepage as a number.  */
     365        if (DosQueryCp (sizeof (cp), cp, &cplen))
     366  	codeset = "";
     367        else
     368  	{
     369  	  sprintf (buf, "CP%u", cp[0]);
     370  	  codeset = buf;
     371  	}
     372      }
     373  
     374  #endif
     375  
     376    if (codeset == NULL)
     377      /* The canonical name cannot be determined.  */
     378      codeset = "";
     379  
     380    /* Resolve alias. */
     381    for (aliases = get_charset_aliases ();
     382         *aliases != '\0';
     383         aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
     384      if (strcmp (codeset, aliases) == 0
     385  	|| (aliases[0] == '*' && aliases[1] == '\0'))
     386        {
     387  	codeset = aliases + strlen (aliases) + 1;
     388  	break;
     389        }
     390  
     391    /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     392       the empty string as denoting "the locale's character encoding",
     393       thus GNU libiconv would call this function a second time.  */
     394    if (codeset[0] == '\0')
     395      codeset = "ASCII";
     396  
     397    return codeset;
     398  }