1  
       2  /*
       3   * encode.c - string conversion routines (mostly for compatibility with
       4   *            udev/volume_id)
       5   *
       6   * Copyright (C) 2008 Kay Sievers <kay.sievers@vrfy.org>
       7   * Copyright (C) 2009 Karel Zak <kzak@redhat.com>
       8   *
       9   * This file may be redistributed under the terms of the
      10   * GNU Lesser General Public License.
      11   */
      12  #include <stdio.h>
      13  #include <stdlib.h>
      14  #include <stddef.h>
      15  #include <unistd.h>
      16  #include <errno.h>
      17  #include <string.h>
      18  #include <ctype.h>
      19  
      20  #include "blkidP.h"
      21  #include "strutils.h"
      22  
      23  /**
      24   * SECTION: encode
      25   * @title: Encoding utils
      26   * @short_description: encode strings to safe udev-compatible formats
      27   *
      28   */
      29  
      30  /* count of characters used to encode one unicode char */
      31  static int utf8_encoded_expected_len(const char *str)
      32  {
      33  	unsigned char c = (unsigned char)str[0];
      34  
      35  	if (c < 0x80)
      36  		return 1;
      37  	if ((c & 0xe0) == 0xc0)
      38  		return 2;
      39  	if ((c & 0xf0) == 0xe0)
      40  		return 3;
      41  	if ((c & 0xf8) == 0xf0)
      42  		return 4;
      43  	if ((c & 0xfc) == 0xf8)
      44  		return 5;
      45  	if ((c & 0xfe) == 0xfc)
      46  		return 6;
      47  	return 0;
      48  }
      49  
      50  /* decode one unicode char */
      51  static int utf8_encoded_to_unichar(const char *str)
      52  {
      53  	int unichar;
      54  	int len;
      55  	int i;
      56  
      57  	len = utf8_encoded_expected_len(str);
      58  	switch (len) {
      59  	case 1:
      60  		return (int)str[0];
      61  	case 2:
      62  		unichar = str[0] & 0x1f;
      63  		break;
      64  	case 3:
      65  		unichar = (int)str[0] & 0x0f;
      66  		break;
      67  	case 4:
      68  		unichar = (int)str[0] & 0x07;
      69  		break;
      70  	case 5:
      71  		unichar = (int)str[0] & 0x03;
      72  		break;
      73  	case 6:
      74  		unichar = (int)str[0] & 0x01;
      75  		break;
      76  	default:
      77  		return -1;
      78  	}
      79  
      80  	for (i = 1; i < len; i++) {
      81  		if (((int)str[i] & 0xc0) != 0x80)
      82  			return -1;
      83  		unichar <<= 6;
      84  		unichar |= (int)str[i] & 0x3f;
      85  	}
      86  
      87  	return unichar;
      88  }
      89  
      90  /* expected size used to encode one unicode char */
      91  static int utf8_unichar_to_encoded_len(int unichar)
      92  {
      93  	if (unichar < 0x80)
      94  		return 1;
      95  	if (unichar < 0x800)
      96  		return 2;
      97  	if (unichar < 0x10000)
      98  		return 3;
      99  	if (unichar < 0x200000)
     100  		return 4;
     101  	if (unichar < 0x4000000)
     102  		return 5;
     103  	return 6;
     104  }
     105  
     106  /* check if unicode char has a valid numeric range */
     107  static int utf8_unichar_valid_range(int unichar)
     108  {
     109  	if (unichar > 0x10ffff)
     110  		return 0;
     111  	if ((unichar & 0xfffff800) == 0xd800)
     112  		return 0;
     113  	if ((unichar > 0xfdcf) && (unichar < 0xfdf0))
     114  		return 0;
     115  	if ((unichar & 0xffff) == 0xffff)
     116  		return 0;
     117  	return 1;
     118  }
     119  
     120  /* validate one encoded unicode char and return its length */
     121  static int utf8_encoded_valid_unichar(const char *str)
     122  {
     123  	int len;
     124  	int unichar;
     125  	int i;
     126  
     127  	len = utf8_encoded_expected_len(str);
     128  	if (len == 0)
     129  		return -1;
     130  
     131  	/* ascii is valid */
     132  	if (len == 1)
     133  		return 1;
     134  
     135  	/* check if expected encoded chars are available */
     136  	for (i = 0; i < len; i++)
     137  		if ((str[i] & 0x80) != 0x80)
     138  			return -1;
     139  
     140  	unichar = utf8_encoded_to_unichar(str);
     141  
     142  	/* check if encoded length matches encoded value */
     143  	if (utf8_unichar_to_encoded_len(unichar) != len)
     144  		return -1;
     145  
     146  	/* check if value has valid range */
     147  	if (!utf8_unichar_valid_range(unichar))
     148  		return -1;
     149  
     150  	return len;
     151  }
     152  
     153  static int is_whitelisted(char c, const char *white)
     154  {
     155  	if ((c >= '0' && c <= '9') ||
     156  	    (c >= 'A' && c <= 'Z') ||
     157  	    (c >= 'a' && c <= 'z') ||
     158  	    strchr("#+-.:=@_", c) != NULL ||
     159  	    (white != NULL && strchr(white, c) != NULL))
     160  		return 1;
     161  	return 0;
     162  }
     163  
     164  /**
     165   * blkid_encode_string:
     166   * @str: input string to be encoded
     167   * @str_enc: output string to store the encoded input string
     168   * @len: maximum size of the output string, which may be
     169   *       four times as long as the input string
     170   *
     171   * Encode all potentially unsafe characters of a string to the
     172   * corresponding hex value prefixed by '\x'.
     173   *
     174   * Returns: 0 if the entire string was copied, non-zero otherwise.
     175   **/
     176  int blkid_encode_string(const char *str, char *str_enc, size_t len)
     177  {
     178  	size_t i, j;
     179  
     180  	if (!str || !str_enc || !len)
     181  		return -1;
     182  
     183  	for (i = 0, j = 0; str[i] != '\0'; i++) {
     184  		int seqlen;
     185  
     186  		seqlen = utf8_encoded_valid_unichar(&str[i]);
     187  		if (seqlen > 1) {
     188  			if (len-j < (size_t)seqlen)
     189  				goto err;
     190  			memcpy(&str_enc[j], &str[i], seqlen);
     191  			j += seqlen;
     192  			i += (seqlen-1);
     193  		} else if (str[i] == '\\' || !is_whitelisted(str[i], NULL)) {
     194  			if (len-j < 4)
     195  				goto err;
     196  			sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]);
     197  			j += 4;
     198  		} else {
     199  			if (len-j < 1)
     200  				goto err;
     201  			str_enc[j] = str[i];
     202  			j++;
     203  		}
     204  		if (j+3 >= len)
     205  			goto err;
     206  	}
     207  	if (len-j < 1)
     208  		goto err;
     209  	str_enc[j] = '\0';
     210  	return 0;
     211  err:
     212  	return -1;
     213  }
     214  
     215  /**
     216   * blkid_safe_string:
     217   * @str: input string
     218   * @str_safe: output string
     219   * @len: size of output string
     220   *
     221   * Processing whitespace characters. Allows valid ascii,valid utf8.
     222   * Replace everything else with'_'
     223   *
     224   * Returns: 0 on success or -1 in case of error.
     225   */
     226  int blkid_safe_string(const char *str, char *str_safe, size_t len)
     227  {
     228  	size_t i = 0;
     229  
     230  	if (!str || !str_safe || !len)
     231  		return -1;
     232  
     233  	__normalize_whitespace(
     234  			(const unsigned char *) str, strnlen(str, len),
     235  			(unsigned char *) str_safe, len);
     236  
     237  	while (i < len && str_safe[i] != '\0') {
     238  		int seqsz;
     239  
     240  		/* accept ASCII from ' ' to '~' */
     241  		if (str_safe[i] > 0x20 && str_safe[i] <= 0x7E)
     242  			i++;
     243  
     244  		/* accept hex encoding */
     245  		else if (str_safe[i] == '\\' && str_safe[i+1] == 'x')
     246  			i += 2;
     247  
     248  		/* replace whitespace */
     249  		else if (isspace(str_safe[i]))
     250  			str_safe[i++] = '_';
     251  
     252  		/* accept valid utf8 */
     253  		else if ((seqsz = utf8_encoded_valid_unichar(&str_safe[i])) >= 1)
     254  			i += seqsz;
     255  
     256  		/* everything else is replaced with '_' */
     257  		else
     258  			str_safe[i++] = '_';
     259  	}
     260  
     261  	str_safe[len - 1] = '\0';
     262  	return 0;
     263  }