gettext-tools/src/sentence.c

   1 /* Sentence handling.
   2    Copyright (C) 2015 Free Software Foundation, Inc.
   3    Written by Daiki Ueno <ueno@gnu.org>, 2015.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #ifdef HAVE_CONFIG_H
  19 # include <config.h>
  20 #endif
  21
  22 /* Specification.  */
  23 #include "sentence.h"
  24
  25 #include <stdlib.h>
  26 #include <string.h>
  27 #include "unistr.h"
  28
  29
  30 /* The minimal number of white spaces which should follow after the
  31    end of sentence.  */
  32 int sentence_end_required_spaces = 1;
  33
  34 /* This function works in a similar way to 'forward-sentence' in
  35    Emacs, which basically does a regular expression matching of:
  36
  37      [.?!\u2026]
  38        []"'\u201d)}]*
  39          \($\|[ \u00a0]$\|\t\|[ \u00a0]\{REQUIRED_SPACES\}\)
  40
  41    Since we are lacking a regular expression routine capable of
  42    Unicode (though gnulib-lib/lib/regex.c provides a locale-dependent
  43    version, we would rather avoid depending on it), apply a manually
  44    constructed DFA, which consists of 8 states where 4 of them are a
  45    terminal.  */
  46 const char *
  47 sentence_end (const char *string, ucs4_t *ending_charp)
  48 {
  49   const char *str = string;
  50   const char *str_limit = string + strlen (str);
  51   /* States of the DFA, 0 to 7, where 3, 5, 6, and 7 are a terminal.  */
  52   int state = 0;
  53   /* Previous character before an end marker.  */
  54   ucs4_t ending_char = 0xfffd;
  55   /* Possible starting position of the match, and the next starting
  56      position if the current match fails.  */
  57   const char *match_start = NULL, *match_next = NULL;
  58   /* Number of spaces.  */
  59   int spaces = 0;
  60
  61   while (str <= str_limit)
  62     {
  63       ucs4_t uc;
  64       size_t length;
  65
  66       length = u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
  67
  68       if (state == 0)
  69         {
  70           switch (uc)
  71             {
  72             case '.': case '?': case '!': case 0x2026:
  73               state = 1;
  74               match_start = str;
  75               match_next = str + length;
  76               ending_char = uc;
  77               spaces = 0;
  78               break;
  79
  80             default:
  81               break;
  82             }
  83
  84           str += length;
  85           continue;
  86         }
  87
  88       if (state == 1)
  89         {
  90           switch (uc)
  91             {
  92             case ']': case '"': case '\'': case ')': case '}': case 0x201d:
  93               state = 2;
  94               break;
  95
  96             case '\0': case '\n':
  97               /* State 3.  */
  98               *ending_charp = ending_char;
  99               return match_start;
 100
 101             case ' ': case 0x00a0:
 102               if (++spaces == sentence_end_required_spaces)
 103                 {
 104                   /* State 7.  */
 105                   *ending_charp = ending_char;
 106                   return match_start;
 107                 }
 108               state = 4;
 109               break;
 110
 111             case '\t':
 112               /* State 5.  */
 113               *ending_charp = ending_char;
 114               return match_start;
 115
 116             default:
 117               str = match_next;
 118               state = 0;
 119               continue;
 120             }
 121
 122           str += length;
 123           continue;
 124         }
 125
 126       if (state == 2)
 127         {
 128           switch (uc)
 129             {
 130             case ']': case '"': case '\'': case ')': case '}': case 0x201d:
 131               break;
 132
 133             case '\0': case '\n':
 134               /* State 3.  */
 135               *ending_charp = ending_char;
 136               return match_start;
 137
 138             case ' ': case 0x00a0:
 139               if (++spaces == sentence_end_required_spaces)
 140                 {
 141                   /* State 7.  */
 142                   *ending_charp = ending_char;
 143                   return match_start;
 144                 }
 145               state = 4;
 146               break;
 147
 148             case '\t':
 149               /* State 5.  */
 150               *ending_charp = ending_char;
 151               return match_start;
 152
 153             default:
 154               state = 0;
 155               str = match_next;
 156               continue;
 157             }
 158
 159           str += length;
 160           continue;
 161         }
 162
 163       if (state == 4)
 164         {
 165           switch (uc)
 166             {
 167             case '\0': case '\n':
 168               /* State 6.  */
 169               *ending_charp = ending_char;
 170               return match_start;
 171
 172             case ' ': case 0x00a0:
 173               if (++spaces == sentence_end_required_spaces)
 174                 {
 175                   /* State 7.  */
 176                   *ending_charp = ending_char;
 177                   return match_start;
 178                 }
 179               break;
 180
 181             default:
 182               state = 0;
 183               str = match_next;
 184               continue;
 185             }
 186
 187           str += length;
 188           continue;
 189         }
 190     }
 191
 192   *ending_charp = 0xfffd;
 193   return str_limit;
 194 }