third-party/libunibreak/unibreakdef.c

   1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
   2
   3 /*
   4  * Break processing in a Unicode sequence.  Designed to be used in a
   5  * generic text renderer.
   6  *
   7  * Copyright (C) 2015 Wu Yongwei <wuyongwei at gmail dot com>
   8  *
   9  * This software is provided 'as-is', without any express or implied
  10  * warranty.  In no event will the author be held liable for any damages
  11  * arising from the use of this software.
  12  *
  13  * Permission is granted to anyone to use this software for any purpose,
  14  * including commercial applications, and to alter it and redistribute
  15  * it freely, subject to the following restrictions:
  16  *
  17  * 1. The origin of this software must not be misrepresented; you must
  18  *    not claim that you wrote the original software.  If you use this
  19  *    software in a product, an acknowledgement in the product
  20  *    documentation would be appreciated but is not required.
  21  * 2. Altered source versions must be plainly marked as such, and must
  22  *    not be misrepresented as being the original software.
  23  * 3. This notice may not be removed or altered from any source
  24  *    distribution.
  25  */
  26
  27 /**
  28  * @file    unibreakdef.c
  29  *
  30  * Definition of utility functions used by the libunibreak library.
  31  *
  32  * @version 1.0, 2015/04/18
  33  * @author  Wu Yongwei
  34  */
  35
  36 #include <assert.h>
  37 #include <stddef.h>
  38 #include "unibreakdef.h"
  39
  40 /**
  41  * Gets the next Unicode character in a UTF-8 sequence.  The index will
  42  * be advanced to the next complete character, unless the end of string
  43  * is reached in the middle of a UTF-8 sequence.
  44  *
  45  * @param[in]     s    input UTF-8 string
  46  * @param[in]     len  length of the string in bytes
  47  * @param[in,out] ip   pointer to the index
  48  * @return             the Unicode character beginning at the index; or
  49  *                     #EOS if end of input is encountered
  50  */
  51 utf32_t ub_get_next_char_utf8(
  52         const utf8_t *s,
  53         size_t len,
  54         size_t *ip)
  55 {
  56     utf8_t ch;
  57     utf32_t res;
  58
  59     assert(*ip <= len);
  60     if (*ip == len)
  61         return EOS;
  62     ch = s[*ip];
  63
  64     if (ch < 0xC2 || ch > 0xF4)
  65     {   /* One-byte sequence, tail (should not occur), or invalid */
  66         *ip += 1;
  67         return ch;
  68     }
  69     else if (ch < 0xE0)
  70     {   /* Two-byte sequence */
  71         if (*ip + 2 > len)
  72             return EOS;
  73         res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
  74         *ip += 2;
  75         return res;
  76     }
  77     else if (ch < 0xF0)
  78     {   /* Three-byte sequence */
  79         if (*ip + 3 > len)
  80             return EOS;
  81         res = ((ch & 0x0F) << 12) +
  82               ((s[*ip + 1] & 0x3F) << 6) +
  83               ((s[*ip + 2] & 0x3F));
  84         *ip += 3;
  85         return res;
  86     }
  87     else
  88     {   /* Four-byte sequence */
  89         if (*ip + 4 > len)
  90             return EOS;
  91         res = ((ch & 0x07) << 18) +
  92               ((s[*ip + 1] & 0x3F) << 12) +
  93               ((s[*ip + 2] & 0x3F) << 6) +
  94               ((s[*ip + 3] & 0x3F));
  95         *ip += 4;
  96         return res;
  97     }
  98 }
  99
 100 /**
 101  * Gets the next Unicode character in a UTF-16 sequence.  The index will
 102  * be advanced to the next complete character, unless the end of string
 103  * is reached in the middle of a UTF-16 surrogate pair.
 104  *
 105  * @param[in]     s    input UTF-16 string
 106  * @param[in]     len  length of the string in words
 107  * @param[in,out] ip   pointer to the index
 108  * @return             the Unicode character beginning at the index; or
 109  *                     #EOS if end of input is encountered
 110  */
 111 utf32_t ub_get_next_char_utf16(
 112         const utf16_t *s,
 113         size_t len,
 114         size_t *ip)
 115 {
 116     utf16_t ch;
 117
 118     assert(*ip <= len);
 119     if (*ip == len)
 120         return EOS;
 121     ch = s[(*ip)++];
 122
 123     if (ch < 0xD800 || ch > 0xDBFF)
 124     {   /* If the character is not a high surrogate */
 125         return ch;
 126     }
 127     if (*ip == len)
 128     {   /* If the input ends here (an error) */
 129         --(*ip);
 130         return EOS;
 131     }
 132     if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
 133     {   /* If the next character is not the low surrogate (an error) */
 134         return ch;
 135     }
 136     /* Return the constructed character and advance the index again */
 137     return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
 138 }
 139
 140 /**
 141  * Gets the next Unicode character in a UTF-32 sequence.  The index will
 142  * be advanced to the next character.
 143  *
 144  * @param[in]     s    input UTF-32 string
 145  * @param[in]     len  length of the string in dwords
 146  * @param[in,out] ip   pointer to the index
 147  * @return             the Unicode character beginning at the index; or
 148  *                     #EOS if end of input is encountered
 149  */
 150 utf32_t ub_get_next_char_utf32(
 151         const utf32_t *s,
 152         size_t len,
 153         size_t *ip)
 154 {
 155     assert(*ip <= len);
 156     if (*ip == len)
 157         return EOS;
 158     return s[(*ip)++];
 159 }