(root)/
glib-2.79.0/
glib/
libcharset/
localcharset.c
       1  /* Determine a canonical name for the current locale's character encoding.
       2  
       3     Copyright (C) 2000-2006 Free Software Foundation, Inc.
       4  
       5     This program is free software; you can redistribute it and/or modify it
       6     under the terms of the GNU Library General Public License as published
       7     by the Free Software Foundation; either version 2, or (at your option)
       8     any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Library General Public License for more details.
      14  
      15     You should have received a copy of the GNU Library General Public
      16     License along with this program; if not, write to the Free Software
      17     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
      18     USA.  */
      19  
      20  /* Written by Bruno Haible <bruno@clisp.org>.  */
      21  
      22  #include "config.h"
      23  
      24  /* Specification.  */
      25  #include "localcharset.h"
      26  
      27  #include <stddef.h>
      28  #include <stdio.h>
      29  #include <string.h>
      30  #include <stdlib.h>
      31  
      32  #if defined _WIN32 || defined __WIN32__
      33  # define WIN32_NATIVE
      34  #endif
      35  
      36  #if defined __EMX__
      37  /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
      38  # define OS2
      39  #endif
      40  
      41  #if !defined WIN32_NATIVE
      42  # if HAVE_LANGINFO_CODESET
      43  #  include <langinfo.h>
      44  # else
      45  #  if 0 /* see comment below */
      46  #   include <locale.h>
      47  #  endif
      48  # endif
      49  # ifdef __CYGWIN__
      50  #  define WIN32_LEAN_AND_MEAN
      51  #  include <windows.h>
      52  # endif
      53  #elif defined WIN32_NATIVE
      54  # define WIN32_LEAN_AND_MEAN
      55  # include <windows.h>
      56  #endif
      57  #if defined OS2
      58  # define INCL_DOS
      59  # include <os2.h>
      60  #endif
      61  
      62  #if ENABLE_RELOCATABLE
      63  # include "relocatable.h"
      64  #else
      65  # define relocate(pathname) (pathname)
      66  #endif
      67  
      68  /* Get GLIB_CHARSETALIAS_DIR.  */
      69  #ifndef GLIB_CHARSETALIAS_DIR
      70  # define GLIB_CHARSETALIAS_DIR LIBDIR
      71  #endif
      72  
      73  #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
      74    /* Win32, Cygwin, OS/2, DOS */
      75  # define ISSLASH(C) ((C) == '/' || (C) == '\\')
      76  #endif
      77  
      78  #ifndef DIRECTORY_SEPARATOR
      79  # define DIRECTORY_SEPARATOR '/'
      80  #endif
      81  
      82  #ifndef ISSLASH
      83  # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
      84  #endif
      85  
      86  #if HAVE_DECL_GETC_UNLOCKED
      87  # undef getc
      88  # define getc getc_unlocked
      89  #endif
      90  
      91  /* The following static variable is declared 'volatile' to avoid a
      92     possible multithread problem in the function get_charset_aliases. If we
      93     are running in a threaded environment, and if two threads initialize
      94     'charset_aliases' simultaneously, both will produce the same value,
      95     and everything will be ok if the two assignments to 'charset_aliases'
      96     are atomic. But I don't know what will happen if the two assignments mix.  */
      97  #if __STDC__ != 1
      98  # define volatile /* empty */
      99  #endif
     100  /* Pointer to the contents of the charset.alias file, if it has already been
     101     read, else NULL.  Its format is:
     102     ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
     103  static const char * volatile charset_aliases;
     104  
     105  /* Return a pointer to the contents of the charset.alias file.  */
     106  const char *
     107  _g_locale_get_charset_aliases (void)
     108  {
     109    const char *cp;
     110  
     111    cp = charset_aliases;
     112    if (cp == NULL)
     113      {
     114  #if !(defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
     115        FILE *fp;
     116        const char *dir;
     117        const char *base = "charset.alias";
     118        char *file_name;
     119  
     120        dir = relocate (GLIB_CHARSETALIAS_DIR);
     121  
     122        /* Concatenate dir and base into freshly allocated file_name.  */
     123        {
     124  	size_t dir_len = strlen (dir);
     125  	size_t base_len = strlen (base);
     126  	int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
     127  	file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
     128  	if (file_name != NULL)
     129  	  {
     130  	    memcpy (file_name, dir, dir_len);
     131  	    if (add_slash)
     132  	      file_name[dir_len] = DIRECTORY_SEPARATOR;
     133  	    memcpy (file_name + dir_len + add_slash, base, base_len + 1);
     134  	  }
     135        }
     136  
     137        if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
     138  	/* Out of memory or file not found, treat it as empty.  */
     139  	cp = "";
     140        else
     141  	{
     142  	  /* Parse the file's contents.  */
     143  	  char *res_ptr = NULL;
     144  	  size_t res_size = 0;
     145  
     146  	  for (;;)
     147  	    {
     148  	      int c;
     149  	      char buf1[50+1];
     150  	      char buf2[50+1];
     151  	      size_t l1, l2;
     152  	      char *old_res_ptr;
     153  
     154  	      c = getc (fp);
     155  	      if (c == EOF)
     156  		break;
     157  	      if (c == '\n' || c == ' ' || c == '\t')
     158  		continue;
     159  	      if (c == '#')
     160  		{
     161  		  /* Skip comment, to end of line.  */
     162  		  do
     163  		    c = getc (fp);
     164  		  while (!(c == EOF || c == '\n'));
     165  		  if (c == EOF)
     166  		    break;
     167  		  continue;
     168  		}
     169  	      ungetc (c, fp);
     170  	      if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
     171  		break;
     172  	      l1 = strlen (buf1);
     173  	      l2 = strlen (buf2);
     174  	      old_res_ptr = res_ptr;
     175  	      if (res_size == 0)
     176  		{
     177  		  res_size = l1 + 1 + l2 + 1;
     178  		  res_ptr = (char *) malloc (res_size + 1);
     179  		}
     180  	      else
     181  		{
     182  		  res_size += l1 + 1 + l2 + 1;
     183  		  res_ptr = (char *) realloc (res_ptr, res_size + 1);
     184  		}
     185  	      if (res_ptr == NULL)
     186  		{
     187  		  /* Out of memory. */
     188  		  res_size = 0;
     189  		  if (old_res_ptr != NULL)
     190  		    free (old_res_ptr);
     191  		  break;
     192  		}
     193  	      strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
     194  	      strcpy (res_ptr + res_size - (l2 + 1), buf2);
     195  	    }
     196  	  fclose (fp);
     197  	  if (res_size == 0)
     198  	    cp = "";
     199  	  else
     200  	    {
     201  	      *(res_ptr + res_size) = '\0';
     202  	      cp = res_ptr;
     203  	    }
     204  	}
     205  
     206        if (file_name != NULL)
     207  	free (file_name);
     208  
     209  #else
     210  
     211  # if defined VMS
     212        /* To avoid the troubles of an extra file charset.alias_vms in the
     213  	 sources of many GNU packages, simply inline the aliases here.  */
     214        /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
     215  	 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
     216  	 section 10.7 "Handling Different Character Sets".  */
     217        cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
     218  	   "ISO8859-2" "\0" "ISO-8859-2" "\0"
     219  	   "ISO8859-5" "\0" "ISO-8859-5" "\0"
     220  	   "ISO8859-7" "\0" "ISO-8859-7" "\0"
     221  	   "ISO8859-8" "\0" "ISO-8859-8" "\0"
     222  	   "ISO8859-9" "\0" "ISO-8859-9" "\0"
     223  	   /* Japanese */
     224  	   "eucJP" "\0" "EUC-JP" "\0"
     225  	   "SJIS" "\0" "SHIFT_JIS" "\0"
     226  	   "DECKANJI" "\0" "DEC-KANJI" "\0"
     227  	   "SDECKANJI" "\0" "EUC-JP" "\0"
     228  	   /* Chinese */
     229  	   "eucTW" "\0" "EUC-TW" "\0"
     230  	   "DECHANYU" "\0" "DEC-HANYU" "\0"
     231  	   "DECHANZI" "\0" "GB2312" "\0"
     232  	   /* Korean */
     233  	   "DECKOREAN" "\0" "EUC-KR" "\0";
     234  # endif
     235  
     236  # if defined WIN32_NATIVE || defined __CYGWIN__
     237        /* To avoid the troubles of installing a separate file in the same
     238  	 directory as the DLL and of retrieving the DLL's directory at
     239  	 runtime, simply inline the aliases here.  */
     240  
     241        cp = "CP936" "\0" "GBK" "\0"
     242  	   "CP1361" "\0" "JOHAB" "\0"
     243  	   "CP20127" "\0" "ASCII" "\0"
     244  	   "CP20866" "\0" "KOI8-R" "\0"
     245  	   "CP20936" "\0" "GB2312" "\0"
     246  	   "CP21866" "\0" "KOI8-RU" "\0"
     247  	   "CP28591" "\0" "ISO-8859-1" "\0"
     248  	   "CP28592" "\0" "ISO-8859-2" "\0"
     249  	   "CP28593" "\0" "ISO-8859-3" "\0"
     250  	   "CP28594" "\0" "ISO-8859-4" "\0"
     251  	   "CP28595" "\0" "ISO-8859-5" "\0"
     252  	   "CP28596" "\0" "ISO-8859-6" "\0"
     253  	   "CP28597" "\0" "ISO-8859-7" "\0"
     254  	   "CP28598" "\0" "ISO-8859-8" "\0"
     255  	   "CP28599" "\0" "ISO-8859-9" "\0"
     256  	   "CP28605" "\0" "ISO-8859-15" "\0"
     257  	   "CP38598" "\0" "ISO-8859-8" "\0"
     258  	   "CP51932" "\0" "EUC-JP" "\0"
     259  	   "CP51936" "\0" "GB2312" "\0"
     260  	   "CP51949" "\0" "EUC-KR" "\0"
     261  	   "CP51950" "\0" "EUC-TW" "\0"
     262  	   "CP54936" "\0" "GB18030" "\0"
     263  	   "CP65001" "\0" "UTF-8" "\0";
     264  # endif
     265  #endif
     266  
     267        charset_aliases = cp;
     268      }
     269  
     270    return cp;
     271  }
     272  
     273  /* Determine the current locale's character encoding, and canonicalize it
     274     into one of the canonical names listed in config.charset.
     275     The result must not be freed; it is statically allocated.
     276     If the canonical name cannot be determined, the result is a non-canonical
     277     name.  */
     278  
     279  const char *
     280  _g_locale_charset_raw (void)
     281  {
     282    const char *codeset;
     283  
     284  #if !(defined WIN32_NATIVE || defined OS2)
     285  
     286  # if HAVE_LANGINFO_CODESET
     287  
     288    /* Most systems support nl_langinfo (CODESET) nowadays.  */
     289    codeset = nl_langinfo (CODESET);
     290  
     291  #  ifdef __CYGWIN__
     292    /* Cygwin 2006 does not have locales.  nl_langinfo (CODESET) always
     293       returns "US-ASCII".  As long as this is not fixed, return the suffix
     294       of the locale name from the environment variables (if present) or
     295       the codepage as a number.  */
     296    if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
     297      {
     298        const char *locale;
     299        static char buf[2 + 10 + 1];
     300  
     301        locale = getenv ("LC_ALL");
     302        if (locale == NULL || locale[0] == '\0')
     303  	{
     304  	  locale = getenv ("LC_CTYPE");
     305  	  if (locale == NULL || locale[0] == '\0')
     306  	    locale = getenv ("LANG");
     307  	}
     308        if (locale != NULL && locale[0] != '\0')
     309  	{
     310  	  /* If the locale name contains an encoding after the dot, return
     311  	     it.  */
     312  	  const char *dot = strchr (locale, '.');
     313  
     314  	  if (dot != NULL)
     315  	    {
     316  	      const char *modifier;
     317  
     318  	      dot++;
     319  	      /* Look for the possible @... trailer and remove it, if any.  */
     320  	      modifier = strchr (dot, '@');
     321  	      if (modifier == NULL)
     322  		return dot;
     323  	      if (modifier - dot < sizeof (buf))
     324  		{
     325  		  memcpy (buf, dot, modifier - dot);
     326  		  buf [modifier - dot] = '\0';
     327  		  return buf;
     328  		}
     329  	    }
     330  	}
     331  
     332        /* Woe32 has a function returning the locale's codepage as a number.  */
     333        sprintf (buf, "CP%u", GetACP ());
     334        codeset = buf;
     335      }
     336  #  endif
     337  
     338  # else
     339  
     340    /* On old systems which lack it, use setlocale or getenv.  */
     341    const char *locale = NULL;
     342  
     343    /* But most old systems don't have a complete set of locales.  Some
     344       (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     345       use setlocale here; it would return "C" when it doesn't support the
     346       locale name the user has set.  */
     347  #  if 0
     348    locale = setlocale (LC_CTYPE, NULL);
     349  #  endif
     350    if (locale == NULL || locale[0] == '\0')
     351      {
     352        locale = getenv ("LC_ALL");
     353        if (locale == NULL || locale[0] == '\0')
     354  	{
     355  	  locale = getenv ("LC_CTYPE");
     356  	  if (locale == NULL || locale[0] == '\0')
     357  	    locale = getenv ("LANG");
     358  	}
     359      }
     360  
     361    /* On some old systems, one used to set locale = "iso8859_1". On others,
     362       you set it to "language_COUNTRY.charset". In any case, we resolve it
     363       through the charset.alias file.  */
     364    codeset = locale;
     365  
     366  # endif
     367  
     368  #elif defined WIN32_NATIVE
     369  
     370    static char buf[2 + 10 + 1];
     371  
     372    /* Woe32 has a function returning the locale's codepage as a number.  */
     373    sprintf (buf, "CP%u", GetACP ());
     374    codeset = buf;
     375  
     376  #elif defined OS2
     377  
     378    const char *locale;
     379    static char buf[2 + 10 + 1];
     380    ULONG cp[3];
     381    ULONG cplen;
     382  
     383    /* Allow user to override the codeset, as set in the operating system,
     384       with standard language environment variables.  */
     385    locale = getenv ("LC_ALL");
     386    if (locale == NULL || locale[0] == '\0')
     387      {
     388        locale = getenv ("LC_CTYPE");
     389        if (locale == NULL || locale[0] == '\0')
     390  	locale = getenv ("LANG");
     391      }
     392    if (locale != NULL && locale[0] != '\0')
     393      {
     394        /* If the locale name contains an encoding after the dot, return it.  */
     395        const char *dot = strchr (locale, '.');
     396  
     397        if (dot != NULL)
     398  	{
     399  	  const char *modifier;
     400  
     401  	  dot++;
     402  	  /* Look for the possible @... trailer and remove it, if any.  */
     403  	  modifier = strchr (dot, '@');
     404  	  if (modifier == NULL)
     405  	    return dot;
     406  	  if (modifier - dot < sizeof (buf))
     407  	    {
     408  	      memcpy (buf, dot, modifier - dot);
     409  	      buf [modifier - dot] = '\0';
     410  	      return buf;
     411  	    }
     412  	}
     413  
     414        /* Resolve through the charset.alias file.  */
     415        codeset = locale;
     416      }
     417    else
     418      {
     419        /* OS/2 has a function returning the locale's codepage as a number.  */
     420        if (DosQueryCp (sizeof (cp), cp, &cplen))
     421  	codeset = "";
     422        else
     423  	{
     424  	  sprintf (buf, "CP%u", cp[0]);
     425  	  codeset = buf;
     426  	}
     427      }
     428  
     429  #endif
     430  
     431    return codeset;
     432  }
     433  
     434  const char *
     435  _g_locale_charset_unalias (const char *codeset)
     436  {
     437    const char *aliases;
     438  
     439    if (codeset == NULL)
     440      /* The canonical name cannot be determined.  */
     441      codeset = "";
     442  
     443    /* Resolve alias. */
     444    for (aliases = _g_locale_get_charset_aliases ();
     445         *aliases != '\0';
     446         aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
     447      if (strcmp (codeset, aliases) == 0
     448  	|| (aliases[0] == '*' && aliases[1] == '\0'))
     449        {
     450  	codeset = aliases + strlen (aliases) + 1;
     451  	break;
     452        }
     453  
     454    /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     455       the empty string as denoting "the locale's character encoding",
     456       thus GNU libiconv would call this function a second time.  */
     457    if (codeset[0] == '\0')
     458      codeset = "ASCII";
     459  
     460    return codeset;
     461  }