Bump to 1.14.1
[platform/upstream/augeas.git] / lib / uniwbrk / u-wordbreaks.h
1 /* Word breaks in UTF-8/UTF-16/UTF-32 strings.  -*- coding: utf-8 -*-
2    Copyright (C) 2009-2016 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5    This program is free software: you can redistribute it and/or modify it
6    under the terms of the GNU Lesser General Public License as published
7    by the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18 void
19 FUNC (const UNIT *s, size_t n, char *p)
20 {
21   if (n > 0)
22     {
23       const UNIT *s_end = s + n;
24
25       /* Word break property of the last character.
26          -1 at the very beginning of the string.  */
27       int last_char_prop = -1;
28
29       /* Format and Extend characters are ignored; this means, the mostly used
30          unit is the complex character (= character with subsequent ignored
31          characters).
32          Word break property of the last complex character.
33          -1 at the very beginning of the string.  */
34       int last_compchar_prop = -1;
35       char *last_compchar_ptr = NULL;
36
37       /* For recognizing rules involving 3 complex characters:
38          Word break property of the second-to-last complex character.
39          -1 at the very beginning of the string.  */
40       int secondlast_compchar_prop = -1;
41
42       /* Don't break inside multibyte characters.  */
43       memset (p, 0, n);
44
45       while (s < s_end)
46         {
47           ucs4_t uc;
48           int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
49           int prop = uc_wordbreak_property (uc);
50
51           /* No break at the start of the string.  */
52           if (last_char_prop >= 0)
53             {
54               /* No break between CR and LF.  */
55               if (last_char_prop == WBP_CR && prop == WBP_LF)
56                 /* *p = 0 */;
57               /* Break before and after newlines.  */
58               else if ((last_char_prop == WBP_CR
59                         || last_char_prop == WBP_LF
60                         || last_char_prop == WBP_NEWLINE)
61                        || (prop == WBP_CR
62                            || prop == WBP_LF
63                            || prop == WBP_NEWLINE))
64                 *p = 1;
65               /* Ignore Format and Extend characters.  */
66               else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT))
67                 {
68                   /* No break in these situations (see UAX #29):
69
70                       secondlast          last             current
71
72     (ALetter | HL)   (MidLetter | MidNumLet | SQ) × (ALetter | HL)      (WB7)
73     (ALetter | HL) × (MidLetter | MidNumLet | SQ)   (ALetter | HL)      (WB6)
74                   Numeric   (MidNum | MidNumLet | SQ)    × Numeric      (WB11)
75                   Numeric × (MidNum | MidNumLet | SQ)      Numeric      (WB12)
76                                                         HL × DQ HL      (WB7b)
77                                                         HL DQ × HL      (WB7c)
78                                    (ALetter | HL) × (ALetter | HL)      (WB5)
79                                           (ALetter | HL) × Numeric      (WB9)
80                                           Numeric × (ALetter | HL)      (WB10)
81                                                  Numeric × Numeric      (WB8)
82                                                       HL × SQ           (WB7a)
83                                                 Katakana × Katakana     (WB13)
84                      (ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a)
85                                             ExtendNumLet × ExtendNumLet (WB13a)
86                     ExtendNumLet × (ALetter | HL | Numeric | Katakana)  (WB13b)
87                                Regional_Indicator × Regional_Indicator  (WB13c)
88                    */
89                   /* No break across certain punctuation.  Also, disable word
90                      breaks that were recognized earlier (due to lookahead of
91                      only one complex character).  */
92                   if (((prop == WBP_ALETTER
93                         || prop == WBP_HL)
94                        && (last_compchar_prop == WBP_MIDLETTER
95                            || last_compchar_prop == WBP_MIDNUMLET
96                            || last_compchar_prop == WBP_SQ)
97                        && (secondlast_compchar_prop == WBP_ALETTER
98                            || secondlast_compchar_prop == WBP_HL))
99                       || (prop == WBP_NUMERIC
100                           && (last_compchar_prop == WBP_MIDNUM
101                               || last_compchar_prop == WBP_MIDNUMLET
102                               || last_compchar_prop == WBP_SQ)
103                           && secondlast_compchar_prop == WBP_NUMERIC)
104                       || (prop == WBP_HL
105                           && last_compchar_prop == WBP_DQ
106                           && secondlast_compchar_prop == WBP_HL))
107                     {
108                       *last_compchar_ptr = 0;
109                       /* *p = 0; */
110                     }
111                   /* Break after Format and Extend characters.  */
112                   else if (last_compchar_prop == WBP_EXTEND
113                            || last_compchar_prop == WBP_FORMAT)
114                     *p = 1;
115                   else
116                     {
117                       /* Normalize property value to table index,
118                          skipping 5 properties: WBP_EXTEND,
119                          WBP_FORMAT, WBP_NEWLINE, WBP_CR, and
120                          WBP_LF.  */
121                       int last_compchar_prop_index = last_compchar_prop;
122                       int prop_index = prop;
123
124                       if (last_compchar_prop_index >= WBP_EXTEND)
125                         last_compchar_prop_index -= 5;
126
127                       if (prop_index >= WBP_EXTEND)
128                         prop_index -= 5;
129
130                       /* Perform a single table lookup.  */
131                       if (uniwbrk_table[last_compchar_prop_index][prop_index])
132                         *p = 1;
133                       /* else *p = 0; */
134                     }
135                 }
136             }
137
138           last_char_prop = prop;
139           /* Ignore Format and Extend characters, except at the start
140              of the line.  */
141           if (last_compchar_prop < 0
142               || last_compchar_prop == WBP_CR
143               || last_compchar_prop == WBP_LF
144               || last_compchar_prop == WBP_NEWLINE
145               || !(prop == WBP_EXTEND || prop == WBP_FORMAT))
146             {
147               secondlast_compchar_prop = last_compchar_prop;
148               last_compchar_prop = prop;
149               last_compchar_ptr = p;
150             }
151
152           s += count;
153           p += count;
154         }
155     }
156 }