2 Copyright (C) 2015 Free Software Foundation, Inc.
3 Written by Daiki Ueno <ueno@gnu.org>, 2015.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
30 /* The minimal number of white spaces which should follow after the end of a sentence.  */
/* sentence_end compares its incremented space counter against this
   value each time it reads ' ' or U+00A0 (NO-BREAK SPACE).  The
   default is 1.  NOTE(review): the variable has external linkage, so
   presumably callers may override it -- confirm against the users of
   this file.  */
32 int sentence_end_required_spaces = 1;
34 /* This function works in a similar way to 'forward-sentence' in
35 Emacs, which basically does a regular expression matching of:
39 [.?!\u2026][]"'\u201d)}]*\($\|[ \u00a0]$\|\t\|[ \u00a0]\{REQUIRED_SPACES\}\)
41 Since we are lacking a regular expression routine capable of
42 Unicode (though gnulib-lib/lib/regex.c provides a locale-dependent
43 version, we would rather avoid depending on it), apply a manually
44 constructed DFA, which consists of 8 states where 4 of them are terminal.  */
47 sentence_end (const char *string, ucs4_t *ending_charp)
49 const char *str = string;
50 const char *str_limit = string + strlen (str);
51 /* States of the DFA, 0 to 7, where 3, 5, 6, and 7 are terminal states. */
53 /* Previous character before an end marker. */
54 ucs4_t ending_char = 0xfffd;
55 /* Possible starting position of the match, and the next starting
56 position if the current match fails. */
57 const char *match_start = NULL, *match_next = NULL;
58 /* Number of spaces. */
61 while (str <= str_limit)
66 length = u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
72 case '.': case '?': case '!': case 0x2026:
75 match_next = str + length;
92 case ']': case '"': case '\'': case ')': case '}': case 0x201d:
98 *ending_charp = ending_char;
101 case ' ': case 0x00a0:
102 if (++spaces == sentence_end_required_spaces)
105 *ending_charp = ending_char;
113 *ending_charp = ending_char;
130 case ']': case '"': case '\'': case ')': case '}': case 0x201d:
133 case '\0': case '\n':
135 *ending_charp = ending_char;
138 case ' ': case 0x00a0:
139 if (++spaces == sentence_end_required_spaces)
142 *ending_charp = ending_char;
150 *ending_charp = ending_char;
167 case '\0': case '\n':
169 *ending_charp = ending_char;
172 case ' ': case 0x00a0:
173 if (++spaces == sentence_end_required_spaces)
176 *ending_charp = ending_char;
192 *ending_charp = 0xfffd;