shlibs/blkid/src/encode.c

   1
   2 /*
   3  * encode.c - string conversion routines (mostly for compatibility with
   4  *            udev/volume_id)
   5  *
   6  * Copyright (C) 2008 Kay Sievers <kay.sievers@vrfy.org>
   7  * Copyright (C) 2009 Karel Zak <kzak@redhat.com>
   8  *
   9  * This file may be redistributed under the terms of the
  10  * GNU Lesser General Public License.
  11  */
  12 #include <stdio.h>
  13 #include <stdlib.h>
  14 #include <stddef.h>
  15 #include <unistd.h>
  16 #include <errno.h>
  17 #include <string.h>
  18 #include <ctype.h>
  19
  20 #include "blkdev.h"
  21 #include "blkidP.h"
  22
  23 #define UDEV_ALLOWED_CHARS_INPUT               "/ $%?,"
  24
  25 /* count of characters used to encode one unicode char */
  26 static int utf8_encoded_expected_len(const char *str)
  27 {
  28         unsigned char c = (unsigned char)str[0];
  29
  30         if (c < 0x80)
  31                 return 1;
  32         if ((c & 0xe0) == 0xc0)
  33                 return 2;
  34         if ((c & 0xf0) == 0xe0)
  35                 return 3;
  36         if ((c & 0xf8) == 0xf0)
  37                 return 4;
  38         if ((c & 0xfc) == 0xf8)
  39                 return 5;
  40         if ((c & 0xfe) == 0xfc)
  41                 return 6;
  42         return 0;
  43 }
  44
  45 /* decode one unicode char */
  46 static int utf8_encoded_to_unichar(const char *str)
  47 {
  48         int unichar;
  49         int len;
  50         int i;
  51
  52         len = utf8_encoded_expected_len(str);
  53         switch (len) {
  54         case 1:
  55                 return (int)str[0];
  56         case 2:
  57                 unichar = str[0] & 0x1f;
  58                 break;
  59         case 3:
  60                 unichar = (int)str[0] & 0x0f;
  61                 break;
  62         case 4:
  63                 unichar = (int)str[0] & 0x07;
  64                 break;
  65         case 5:
  66                 unichar = (int)str[0] & 0x03;
  67                 break;
  68         case 6:
  69                 unichar = (int)str[0] & 0x01;
  70                 break;
  71         default:
  72                 return -1;
  73         }
  74
  75         for (i = 1; i < len; i++) {
  76                 if (((int)str[i] & 0xc0) != 0x80)
  77                         return -1;
  78                 unichar <<= 6;
  79                 unichar |= (int)str[i] & 0x3f;
  80         }
  81
  82         return unichar;
  83 }
  84
  85 /* expected size used to encode one unicode char */
  86 static int utf8_unichar_to_encoded_len(int unichar)
  87 {
  88         if (unichar < 0x80)
  89                 return 1;
  90         if (unichar < 0x800)
  91                 return 2;
  92         if (unichar < 0x10000)
  93                 return 3;
  94         if (unichar < 0x200000)
  95                 return 4;
  96         if (unichar < 0x4000000)
  97                 return 5;
  98         return 6;
  99 }
 100
 101 /* check if unicode char has a valid numeric range */
 102 static int utf8_unichar_valid_range(int unichar)
 103 {
 104         if (unichar > 0x10ffff)
 105                 return 0;
 106         if ((unichar & 0xfffff800) == 0xd800)
 107                 return 0;
 108         if ((unichar > 0xfdcf) && (unichar < 0xfdf0))
 109                 return 0;
 110         if ((unichar & 0xffff) == 0xffff)
 111                 return 0;
 112         return 1;
 113 }
 114
 115 /* validate one encoded unicode char and return its length */
 116 static int utf8_encoded_valid_unichar(const char *str)
 117 {
 118         int len;
 119         int unichar;
 120         int i;
 121
 122         len = utf8_encoded_expected_len(str);
 123         if (len == 0)
 124                 return -1;
 125
 126         /* ascii is valid */
 127         if (len == 1)
 128                 return 1;
 129
 130         /* check if expected encoded chars are available */
 131         for (i = 0; i < len; i++)
 132                 if ((str[i] & 0x80) != 0x80)
 133                         return -1;
 134
 135         unichar = utf8_encoded_to_unichar(str);
 136
 137         /* check if encoded length matches encoded value */
 138         if (utf8_unichar_to_encoded_len(unichar) != len)
 139                 return -1;
 140
 141         /* check if value has valid range */
 142         if (!utf8_unichar_valid_range(unichar))
 143                 return -1;
 144
 145         return len;
 146 }
 147
 148 static int replace_whitespace(const char *str, char *to, size_t len)
 149 {
 150         size_t i, j;
 151
 152         /* strip trailing whitespace */
 153         len = strnlen(str, len);
 154         while (len && isspace(str[len-1]))
 155                 len--;
 156
 157         /* strip leading whitespace */
 158         i = 0;
 159         while (isspace(str[i]) && (i < len))
 160                 i++;
 161
 162         j = 0;
 163         while (i < len) {
 164                 /* substitute multiple whitespace with a single '_' */
 165                 if (isspace(str[i])) {
 166                         while (isspace(str[i]))
 167                                 i++;
 168                         to[j++] = '_';
 169                 }
 170                 to[j++] = str[i++];
 171         }
 172         to[j] = '\0';
 173         return 0;
 174 }
 175
 176 static int is_whitelisted(char c, const char *white)
 177 {
 178         if ((c >= '0' && c <= '9') ||
 179             (c >= 'A' && c <= 'Z') ||
 180             (c >= 'a' && c <= 'z') ||
 181             strchr("#+-.:=@_", c) != NULL ||
 182             (white != NULL && strchr(white, c) != NULL))
 183                 return 1;
 184         return 0;
 185 }
 186
 187 /* allow chars in whitelist, plain ascii, hex-escaping and valid utf8 */
 188 static int replace_chars(char *str, const char *white)
 189 {
 190         size_t i = 0;
 191         int replaced = 0;
 192
 193         while (str[i] != '\0') {
 194                 int len;
 195
 196                 if (is_whitelisted(str[i], white)) {
 197                         i++;
 198                         continue;
 199                 }
 200
 201                 /* accept hex encoding */
 202                 if (str[i] == '\\' && str[i+1] == 'x') {
 203                         i += 2;
 204                         continue;
 205                 }
 206
 207                 /* accept valid utf8 */
 208                 len = utf8_encoded_valid_unichar(&str[i]);
 209                 if (len > 1) {
 210                         i += len;
 211                         continue;
 212                 }
 213
 214                 /* if space is allowed, replace whitespace with ordinary space */
 215                 if (isspace(str[i]) && white != NULL && strchr(white, ' ') != NULL) {
 216                         str[i] = ' ';
 217                         i++;
 218                         replaced++;
 219                         continue;
 220                 }
 221
 222                 /* everything else is replaced with '_' */
 223                 str[i] = '_';
 224                 i++;
 225                 replaced++;
 226         }
 227         return replaced;
 228 }
 229
 230 /**
 231  * blkid_encode_string:
 232  * @str: input string to be encoded
 233  * @str_enc: output string to store the encoded input string
 234  * @len: maximum size of the output string, which may be
 235  *       four times as long as the input string
 236  *
 237  * Encode all potentially unsafe characters of a string to the
 238  * corresponding hex value prefixed by '\x'.
 239  *
 240  * Returns: 0 if the entire string was copied, non-zero otherwise.
 241  **/
 242 int blkid_encode_string(const char *str, char *str_enc, size_t len)
 243 {
 244         size_t i, j;
 245
 246         if (str == NULL || str_enc == NULL)
 247                 return -1;
 248
 249         for (i = 0, j = 0; str[i] != '\0'; i++) {
 250                 int seqlen;
 251
 252                 seqlen = utf8_encoded_valid_unichar(&str[i]);
 253                 if (seqlen > 1) {
 254                         if (len-j < (size_t)seqlen)
 255                                 goto err;
 256                         memcpy(&str_enc[j], &str[i], seqlen);
 257                         j += seqlen;
 258                         i += (seqlen-1);
 259                 } else if (str[i] == '\\' || !is_whitelisted(str[i], NULL)) {
 260                         if (len-j < 4)
 261                                 goto err;
 262                         sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]);
 263                         j += 4;
 264                 } else {
 265                         if (len-j < 1)
 266                                 goto err;
 267                         str_enc[j] = str[i];
 268                         j++;
 269                 }
 270                 if (j+3 >= len)
 271                         goto err;
 272         }
 273         if (len-j < 1)
 274                 goto err;
 275         str_enc[j] = '\0';
 276         return 0;
 277 err:
 278         return -1;
 279 }
 280
 281 /**
 282  * blkid_safe_string:
 283  * @str: input string
 284  * @str_safe: output string
 285  * @len: size of output string
 286  *
 287  * Allows plain ascii, hex-escaping and valid utf8. Replaces all whitespaces
 288  * with '_'.
 289  */
 290 int blkid_safe_string(const char *str, char *str_safe, size_t len)
 291 {
 292         replace_whitespace(str, str_safe, len);
 293         replace_chars(str_safe, UDEV_ALLOWED_CHARS_INPUT);
 294         return 0;
 295 }