src/utf8.c

   1 /*
   2  * Copyright © 2012 Intel Corporation
   3  * Copyright © 2014 Ran Benita <ran234@gmail.com>
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  22  * DEALINGS IN THE SOFTWARE.
  23  *
  24  * Author: Rob Bradford <rob@linux.intel.com>
  25  */
  26
  27 #include "config.h"
  28
  29 #include <stddef.h>
  30 #include <stdbool.h>
  31 #include <inttypes.h>
  32
  33 #include "utf8.h"
  34
  35 /* Conformant encoding form conversion from UTF-32 to UTF-8.
  36  *
  37  * See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875
  38  * for further details.
  39 */
  40 int
  41 utf32_to_utf8(uint32_t unichar, char *buffer)
  42 {
  43     int count, shift, length;
  44     uint8_t head;
  45
  46     if (unichar <= 0x007f) {
  47         buffer[0] = unichar;
  48         buffer[1] = '\0';
  49         return 2;
  50     }
  51     else if (unichar <= 0x07FF) {
  52         length = 2;
  53         head = 0xc0;
  54     }
  55     /* Handle surrogates */
  56     else if (0xd800 <= unichar && unichar <= 0xdfff) {
  57         goto ill_formed_code_unit_subsequence;
  58     }
  59     else if (unichar <= 0xffff) {
  60         length = 3;
  61         head = 0xe0;
  62     }
  63     else if (unichar <= 0x10ffff) {
  64         length = 4;
  65         head = 0xf0;
  66     }
  67     else {
  68         goto ill_formed_code_unit_subsequence;
  69     }
  70
  71     for (count = length - 1, shift = 0; count > 0; count--, shift += 6)
  72         buffer[count] = 0x80 | ((unichar >> shift) & 0x3f);
  73
  74     buffer[0] = head | ((unichar >> shift) & 0x3f);
  75     buffer[length] = '\0';
  76
  77     return length + 1;
  78
  79 ill_formed_code_unit_subsequence:
  80     buffer[0] = '\0';
  81     return 0;
  82 }
  83
  84 bool
  85 is_valid_utf8(const char *ss, size_t len)
  86 {
  87     size_t i = 0;
  88     size_t tail_bytes = 0;
  89     const uint8_t *s = (const uint8_t *) ss;
  90
  91     /* This beauty is from:
  92      *  The Unicode Standard Version 6.2 - Core Specification, Table 3.7
  93      *  https://www.unicode.org/versions/Unicode6.2.0/ch03.pdf#G7404
  94      * We can optimize if needed. */
  95     while (i < len)
  96     {
  97         if (s[i] <= 0x7F) {
  98             tail_bytes = 0;
  99         }
 100         else if (s[i] >= 0xC2 && s[i] <= 0xDF) {
 101             tail_bytes = 1;
 102         }
 103         else if (s[i] == 0xE0) {
 104             i++;
 105             if (i >= len || !(s[i] >= 0xA0 && s[i] <= 0xBF))
 106                 return false;
 107             tail_bytes = 1;
 108         }
 109         else if (s[i] >= 0xE1 && s[i] <= 0xEC) {
 110             tail_bytes = 2;
 111         }
 112         else if (s[i] == 0xED) {
 113             i++;
 114             if (i >= len || !(s[i] >= 0x80 && s[i] <= 0x9F))
 115                 return false;
 116             tail_bytes = 1;
 117         }
 118         else if (s[i] >= 0xEE && s[i] <= 0xEF) {
 119             tail_bytes = 2;
 120         }
 121         else if (s[i] == 0xF0) {
 122             i++;
 123             if (i >= len || !(s[i] >= 0x90 && s[i] <= 0xBF))
 124                 return false;
 125             tail_bytes = 2;
 126         }
 127         else if (s[i] >= 0xF1 && s[i] <= 0xF3) {
 128             tail_bytes = 3;
 129         }
 130         else if (s[i] == 0xF4) {
 131             i++;
 132             if (i >= len || !(s[i] >= 0x80 && s[i] <= 0x8F))
 133                 return false;
 134             tail_bytes = 2;
 135         }
 136         else {
 137             return false;
 138         }
 139
 140         i++;
 141
 142         while (i < len && tail_bytes > 0 && s[i] >= 0x80 && s[i] <= 0xBF) {
 143             i++;
 144             tail_bytes--;
 145         }
 146
 147         if (tail_bytes != 0)
 148             return false;
 149     }
 150
 151     return true;
 152 }