tests/enchant-ispell.c

   1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /* enchant
   3  * Copyright (C) 2003 Dom Lachowicz
   4  *               2007 Hannu Väisänen
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02110-1301, USA.
  20  *
  21  * In addition, as a special exception, Dom Lachowicz
  22  * gives permission to link the code of this program with
  23  * the non-LGPL Spelling Provider libraries (eg: a MSFT Office
  24  * spell checker backend) and distribute linked combinations including
  25  * the two.  You must obey the GNU Lesser General Public License in all
  26  * respects for all of the code used other than said providers. If you modify
  27  * this file, you may extend this exception to your version of the
  28  * file, but you are not obligated to do so. If you do not wish to
  29  * do so, delete this exception statement from your version.
  30  */
  31
  32 /**
  33  * This is a rough approximation of an "ispell compatibility mode"
  34  * for Enchant.
  35  *
  36  * Modified in 2007 to work when called from emacs which
  37  * calls a spelling program (e.g. enchant) like this
  38  *
  39  * enchant -a -m -d dictionary
  40  */
  41
  42 #include <stdio.h>
  43 #include <stdlib.h>
  44 #include <string.h>
  45 #include <locale.h>
  46 #include <glib.h>
  47
  48 #include "enchant.h"
  49 #include "enchant-provider.h"
  50
/* In -a mode a word must be longer than this many bytes to be spell-checked;
   shorter words are reported as correct without consulting the dictionary. */
  52 #define MIN_WORD_LENGTH 1
  53
  54 #ifdef WIN32
  55 #define WIN32_LEAN_AND_MEAN
  56 #include <windows.h>
  57
  58 static char charset[15] = "CP437";
  59 #endif
  60
  61 typedef enum
  62         {
  63                 MODE_NONE,
  64                 MODE_VERSION,
  65                 MODE_A,
  66                 MODE_L
  67         } IspellMode_t;
  68
  69 static void
  70 print_version (FILE * to)
  71 {
  72         fprintf (to, "@(#) International Ispell Version 3.1.20 (but really Enchant %s)\n", VERSION);
  73         fflush (to);
  74 }
  75
  76 static void
  77 print_help (FILE * to, const char * prog)
  78 {
  79         fprintf (to, "Usage: %s [options] -a|-d dict|-l|-L|-m|-v[v]|<file>\n", prog);
  80         fprintf (to, "\t-a lists alternatives.\n");
  81         fprintf (to, "\t-d dict uses dictionary <dict>.\n");
  82         fprintf (to, "\t-l lists misspellings.\n");
  83         fprintf (to, "\t-m is ignored.\n");
  84         fprintf (to, "\t-L displays line numbers.\n");
  85         fprintf (to, "\t-v displays program version.\n");
  86 }
  87
  88 static gboolean
  89 consume_line (FILE * in, GString * str)
  90 {
  91         int ch;
  92         gsize bytes_read, bytes_written;
  93         gchar * utf;
  94         gboolean ret = TRUE;
  95
  96         g_string_truncate (str, 0);
  97
  98         while (ret && (ch = fgetc (in)) != EOF) {
  99                 if (ch == '\r')
 100                         continue;
 101                 else {
 102                         g_string_append_c (str, ch);
 103                         if (ch == '\n')
 104                                 ret = FALSE;
 105                 }
 106         }
 107
 108         if (str->len) {
 109 #ifdef WIN32
 110                 utf = g_convert(str->str, str->len, "UTF-8", charset, &bytes_read, &bytes_written, NULL);
 111 #else
 112                 utf = g_locale_to_utf8 (str->str, str->len, &bytes_read, &bytes_written, NULL);
 113 #endif
 114
 115                 if (utf) {
 116                         g_string_assign (str, utf);
 117                         g_free (utf);
 118                 }
 119                 /* Else str->str stays the same. we'll assume that it's
 120                    already utf8 and glib is just being stupid. */
 121         }
 122
 123         return ret;
 124 }
 125
 126 static void
 127 print_utf (FILE * out, const char * str)
 128 {
 129         gsize bytes_read, bytes_written;
 130         gchar * native;
 131
 132         native = g_locale_from_utf8 (str, -1, &bytes_read, &bytes_written, NULL);
 133         if (native) {
 134                 fwrite (native, 1, bytes_written, out);
 135                 g_free (native);
 136         } else {
 137                 /* We'll assume that it's already utf8 and glib is just being stupid. */
 138                 fwrite (str, 1, strlen (str), out);
 139         }
 140 }
 141
 142 static void
 143 do_mode_a (FILE * out, EnchantDict * dict, GString * word, size_t start_pos, size_t lineCount)
 144 {
 145         size_t n_suggs;
 146         char ** suggs;
 147
 148         if (word->len <= MIN_WORD_LENGTH || enchant_dict_check (dict, word->str, word->len) == 0) {
 149                 if (lineCount)
 150                         fprintf (out, "* %u\n", (unsigned int)lineCount);
 151                 else
 152                         fwrite ("*\n", 1, 2, out);
 153         }
 154         else {
 155                 suggs = enchant_dict_suggest (dict, word->str,
 156                                               word->len, &n_suggs);
 157                 if (!n_suggs || !suggs) {
 158                         fwrite ("# ", 1, 2, out);
 159                         if (lineCount)
 160                                 fprintf (out, "%u ", (unsigned int)lineCount);
 161                         print_utf (out, word->str);
 162                         fprintf (out, " %u\n", (unsigned int)start_pos);
 163                 }
 164                 else {
 165                         size_t i = 0;
 166
 167                         fwrite ("& ", 1, 2, out);
 168                         if (lineCount)
 169                                 fprintf (out, "%u ", (unsigned int)lineCount);
 170                         print_utf (out, word->str);
 171                         fprintf (out, " %u %u:", (unsigned int)n_suggs, (unsigned int)start_pos);
 172
 173                         for (i = 0; i < n_suggs; i++) {
 174                                 fprintf (out, " ");
 175                                 print_utf (out, suggs[i]);
 176
 177                                 if (i != (n_suggs - 1))
 178                                         fwrite (",", 1, 1, out);
 179                                 else
 180                                         fwrite ("\n", 1, 1, out);
 181                         }
 182
 183                         enchant_dict_free_string_list (dict, suggs);
 184                 }
 185         }
 186 }
 187
 188 static void
 189 do_mode_l (FILE * out, EnchantDict * dict, GString * word, size_t lineCount)
 190 {
 191         if (enchant_dict_check (dict, word->str, word->len) != 0) {
 192                 if (lineCount)
 193                         fprintf (out, "%u ", (unsigned int)lineCount);
 194                 print_utf (out, word->str);
 195                 fwrite ("\n", 1, 1, out);
 196         }
 197 }
 198
 199
 200 static int
 201 is_word_char (gunichar uc, size_t n)
 202 {
 203         GUnicodeType type;
 204
 205         type = g_unichar_type(uc);
 206
 207         switch (type) {
 208         case G_UNICODE_MODIFIER_LETTER:
 209         case G_UNICODE_LOWERCASE_LETTER:
 210         case G_UNICODE_TITLECASE_LETTER:
 211         case G_UNICODE_UPPERCASE_LETTER:
 212         case G_UNICODE_OTHER_LETTER:
 213         case G_UNICODE_COMBINING_MARK:
 214         case G_UNICODE_ENCLOSING_MARK:
 215         case G_UNICODE_NON_SPACING_MARK:
 216         case G_UNICODE_DECIMAL_NUMBER:
 217         case G_UNICODE_LETTER_NUMBER:
 218         case G_UNICODE_OTHER_NUMBER:
 219         case G_UNICODE_CONNECT_PUNCTUATION:
 220                 return 1;     /* Enchant 1.3.0 defines word chars like this. */
 221
 222         case G_UNICODE_CONTROL:
 223         case G_UNICODE_FORMAT:
 224         case G_UNICODE_UNASSIGNED:
 225         case G_UNICODE_PRIVATE_USE:
 226         case G_UNICODE_SURROGATE:
 227         case G_UNICODE_DASH_PUNCTUATION:
 228         case G_UNICODE_CLOSE_PUNCTUATION:
 229         case G_UNICODE_FINAL_PUNCTUATION:
 230         case G_UNICODE_INITIAL_PUNCTUATION:
 231         case G_UNICODE_OTHER_PUNCTUATION:
 232         case G_UNICODE_OPEN_PUNCTUATION:
 233         case G_UNICODE_CURRENCY_SYMBOL:
 234         case G_UNICODE_MODIFIER_SYMBOL:
 235         case G_UNICODE_MATH_SYMBOL:
 236         case G_UNICODE_OTHER_SYMBOL:
 237         case G_UNICODE_LINE_SEPARATOR:
 238         case G_UNICODE_PARAGRAPH_SEPARATOR:
 239         case G_UNICODE_SPACE_SEPARATOR:
 240         default:
 241                 if ((n > 0) && (uc == g_utf8_get_char("'"))) {
 242                         return 1;  /** Char ' is accepted only within a word. */
 243                 }
 244                 else if ((n > 0) && (type == G_UNICODE_DASH_PUNCTUATION)) {
 245                         return 1; /* hyphens only accepted within a word. */
 246                 }
 247
 248                 return 0;
 249         }
 250 }
 251
 252
 253 typedef struct lang_map {
 254   const char *ispell;
 255   const char *enchant;
 256 } LangMap;
 257
 258
 259 /* Maps ispell language codes to enchant language codes. */
 260 /* The list is partially taken from src/ispell/ispell_checker.cpp. */
 261 static const LangMap lingua[] = {
 262         {"american",    "en_US"},
 263         {"brazilian",   "pt_BR"},
 264         {"british",     "en_GB"},
 265         {"bulgarian",   "bg"},
 266         {"catala",      "ca"},
 267         {"catalan",     "ca"},
 268         {"danish",      "da"},
 269         {"dansk",       "da"},
 270         {"deutsch",     "de"},
 271         {"dutch",       "nl"},
 272         {"ellhnika",    "el"},
 273         {"espanol",     "es"},
 274         {"esperanto",   "eo"},
 275         {"estonian",    "et"},
 276         {"faeroese",    "fo"},
 277         {"finnish",     "fi"},
 278         {"francais",    "fr"},
 279         {"french",      "fr"},
 280         {"galician",    "gl"},
 281         {"german",      "de"},
 282         {"hungarian",   "hu"},
 283         {"interlingua", "ia"},
 284         {"irish",       "ga"},
 285         {"italian",     "it"},
 286         {"latin",       "la"},
 287         {"lietuviu",    "lt"},
 288         {"lithuanian",  "lt"},
 289         {"mlatin",      "la"},
 290         {"nederlands",  "nl"},
 291         {"norsk",       "no"},
 292         {"norwegian",   "no"},
 293         {"nynorsk",     "nn"},
 294         {"polish",      "pl"},
 295         {"portugues",   "pt"},
 296         {"portuguese",  "pt"},
 297         {"russian",     "ru"},
 298         {"sardinian",   "sc"},
 299         {"slovak",      "sk"},
 300         {"slovenian",   "sl"},
 301         {"slovensko",   "sl"},
 302         {"spanish",     "es"},
 303         {"suomi",       "fi"},   /* For Emacs/Voikko/tmispell compatibility. */
 304         {"svenska",     "sv"},
 305         {"swedish",     "sv"},
 306         {"swiss",       "de_CH"},
 307         {"ukrainian",   "uk"},
 308         {"yiddish-yivo",        "yi"},
 309         {NULL,       NULL}    /* Last item must be {NULL, NULL}. */
 310 };
 311
 312
 313 /* Converts ispell language code to enchant language code. */
 314 static gchar *
 315 convert_language_code (gchar *code)
 316 {
 317         size_t i;
 318         for (i = 0; lingua[i].ispell; i++) {
 319                 if (!strcmp(code,lingua[i].ispell)) {
 320                         /* We must call g_strdup() because the calling program g_free()s the result. */
 321                         return g_strdup (lingua[i].enchant);
 322                 }
 323         }
 324         /* Let's call g_strdup() here too! */
 325         return g_strdup (code);
 326 }
 327
 328
 329 /* Splits a line into a set of (word,word_position) touples. */
 330 static GSList *
 331 tokenize_line (GString * line)
 332 {
 333         GSList * tokens = NULL;
 334         char *utf = (char *) line->str;
 335
 336         GString * word;
 337
 338         gunichar uc;
 339         size_t cur_pos = 0;
 340         size_t start_pos = 0;
 341         word = g_string_new (NULL);
 342
 343         while (cur_pos < line->len && *utf) {
 344     int i;
 345                 /* Skip non-word characters. */
 346                 cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
 347                 uc = g_utf8_get_char (utf);
 348                 while (cur_pos < line->len && *utf && !is_word_char(uc,0)) {
 349                         utf = g_utf8_next_char (utf);
 350                         uc = g_utf8_get_char (utf);
 351                         cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
 352                 }
 353                 start_pos = cur_pos;
 354
 355                 /* Skip over word. */
 356                 while (cur_pos < line->len && *utf && is_word_char(uc,1)) {
 357                         g_string_append_unichar (word, uc);
 358                         utf = g_utf8_next_char (utf);
 359                         uc = g_utf8_get_char (utf);
 360                         cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
 361                 }
 362
 363                 /* Do not accept one or more  ' at the end of the word. */
 364                 i = word->len-1;
 365                 while ((i >= 0) && (word->str[i] == '\'')) {
 366                         g_string_truncate (word, i);
 367                         i--;
 368                 }
 369
 370                 /* Save (word, position) touple. */
 371                 if (word->len) {
 372                         tokens = g_slist_append (tokens, g_string_new_len (word->str, word->len));
 373                         tokens = g_slist_append (tokens, GINT_TO_POINTER(start_pos));
 374                         g_string_truncate (word, 0);
 375                 }
 376         }
 377         g_string_free (word, TRUE);
 378
 379         return tokens;
 380 }
 381
 382 static int
 383 parse_file (FILE * in, FILE * out, IspellMode_t mode, int countLines, gchar *dictionary)
 384 {
 385         EnchantBroker * broker;
 386         EnchantDict * dict;
 387
 388         GString * str, * word = NULL;
 389         GSList * tokens, *token_ptr;
 390         gchar * lang;
 391         size_t pos, lineCount = 0;
 392
 393         gboolean was_last_line = FALSE, corrected_something = FALSE;
 394
 395         if (mode == MODE_A)
 396                 print_version (out);
 397
 398         if (dictionary) {
 399                 lang = convert_language_code (dictionary);
 400         }
 401         else {
 402                 lang = enchant_get_user_language();
 403                 if(!lang)
 404                         return 1;
 405         }
 406
 407         /* Enchant will get rid of useless trailing garbage like de_DE@euro or de_DE.ISO-8859-15 */
 408
 409         broker = enchant_broker_init ();
 410         dict = enchant_broker_request_dict (broker, lang);
 411
 412         if (!dict) {
 413                 fprintf (stderr, "Couldn't create a dictionary for %s\n", lang);
 414                 g_free (lang);
 415                 enchant_broker_free (broker);
 416                 return 1;
 417         }
 418
 419         g_free (lang);
 420
 421         str = g_string_new (NULL);
 422
 423         while (!was_last_line) {
 424                 was_last_line = consume_line (in, str);
 425
 426                 if (countLines)
 427                         lineCount++;
 428
 429                 if (str->len) {
 430                         corrected_something = FALSE;
 431                         token_ptr = tokens = tokenize_line (str);
 432                         while (tokens != NULL) {
 433                                 corrected_something = TRUE;
 434
 435                                 word = (GString *)tokens->data;
 436                                 tokens = tokens->next;
 437                                 pos = GPOINTER_TO_INT(tokens->data);
 438                                 tokens = tokens->next;
 439
 440                                 if (mode == MODE_A)
 441                                         do_mode_a (out, dict, word, pos, lineCount);
 442                                 else if (mode == MODE_L)
 443                                         do_mode_l (out, dict, word, lineCount);
 444
 445                                 g_string_free(word, TRUE);
 446                         }
 447                         if (token_ptr)
 448                                 g_slist_free (token_ptr);
 449                 }
 450
 451                 if (mode == MODE_A && corrected_something) {
 452                         fwrite ("\n", 1, 1, out);
 453                 }
 454                 g_string_truncate (str, 0);
 455                 fflush (out);
 456         }
 457
 458         enchant_broker_free_dict (broker, dict);
 459         enchant_broker_free (broker);
 460
 461         g_string_free (str, TRUE);
 462
 463         return 0;
 464 }
 465
 466 int main (int argc, char ** argv)
 467 {
 468         IspellMode_t mode = MODE_NONE;
 469
 470         char * file = NULL;
 471         int i, rval = 0;
 472
 473         FILE * fp = stdin;
 474
 475         int countLines = 0;
 476         gchar *dictionary = 0;  /* -d dictionary */
 477
 478         /* Initialize system locale */
 479         setlocale(LC_ALL, "");
 480
 481 #ifdef WIN32
 482         /* Workaround about glib's "locale" not being the set C locale */
 483         if (GetFileType(GetStdHandle(STD_INPUT_HANDLE)) != FILE_TYPE_CHAR) {
 484                 sprintf_s(charset,15,"CP%u",GetACP());
 485         } else {
 486                 sprintf_s(charset,15,"CP%u",GetConsoleCP());
 487         }
 488 #endif
 489
 490         for (i = 1; i < argc; i++) {
 491                 char * arg = argv[i];
 492                 if (arg[0] == '-') {
 493                         if (strlen (arg) == 2) {
 494                                 /* It seems that the first one of these that is specified gets precedence. */
 495                                 if (arg[1] == 'a' && MODE_NONE == mode)
 496                                         mode = MODE_A;
 497                                 else if (arg[1] == 'l' && MODE_NONE == mode)
 498                                         mode = MODE_L;
 499                                 else if (arg[1] == 'v' && MODE_NONE == mode)
 500                                         mode = MODE_VERSION;
 501                                 else if (arg[1] == 'L' && MODE_NONE == mode)
 502                                         countLines = 1;
 503                                 else if (arg[1] == 'm')
 504                                         ; /* Ignore. Emacs calls ispell with '-m'. */
 505                                 else if (arg[1] == 'd') {
 506                                         i++;
 507                                         dictionary = argv[i];  /* Emacs calls ispell with '-d dictionary'. */
 508                                 }
 509                         }
 510                         else if ((strlen (arg) == 3) && (arg[1] == 'v') && (arg[2] == 'v')) {
 511                                 mode = MODE_VERSION;   /* Emacs (or ispell.el) calls [ai]spell with '-vv'. */
 512                         }
 513                         else if (arg[1] == 'd') {
 514                                 dictionary = arg + 2;  /* Accept "-ddictionary", i.e. no space between -d and dictionary. */
 515                         }
 516                         else if (strlen (arg) > 2) {
 517                                 fprintf (stderr, "-%c does not take any parameters.\n", arg[1]);
 518                                 exit(1);
 519                         }
 520                         else
 521                                 file = arg;
 522                 }
 523                 else
 524                         file = arg;
 525         }
 526
 527         if (mode == MODE_VERSION) {
 528                 print_version (stdout);
 529         }
 530         else if (mode == MODE_NONE && !file) {
 531                 print_help (stdout, argv[0]);
 532         }
 533         else {
 534                 if (file) {
 535                         fp = enchant_fopen (file, "rb");
 536                         if (!fp) {
 537                                 fprintf (stderr, "Error: Could not open the file \"%s\" for reading.\n", file);
 538                                 exit (1);
 539                         }
 540                 }
 541
 542                 rval = parse_file (fp, stdout, mode, countLines, dictionary);
 543
 544                 if (file)
 545                         fclose (fp);
 546         }
 547
 548         return rval;
 549 }