2 * testboundaries.c: Test text boundary algorithms
4 * Copyright (C) 1999-2000 Red Hat Software
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
16 * You should have received a copy of the GNU Library General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
27 #include <pango/pango.h>
/* printf conversion used when a character code is printed in a failure
 * message: hexadecimal, with a "0x" prefix ('#' flag), zero-padded ('0'
 * flag) to a minimum field width of 6.
 */
29 #define CHFORMAT "%0#6x"
31 /* FIXME for now this just tests that the breaking of some sample
32 * text conforms to certain rules and invariants. But eventually
33 * we should also have test-result pairs, i.e. a string and some
34 * encoding of the correct way to break the string, to check
35 * more precisely that things worked
39 /* "virama script" is just an optimization; it includes a bunch of
40 * scripts without viramas in them
/* VIRAMA_SCRIPT(wc): inclusive range test, true for 0x0901 through 0x17FF.
 * VIRAMA(wc): equality tests against individual code points, the first of
 * which is 0x094D.
 * Both macros evaluate their argument more than once, so pass an
 * expression without side effects.
 */
42 #define VIRAMA_SCRIPT(wc) ((wc) >= 0x0901 && (wc) <= 0x17FF)
43 #define VIRAMA(wc) ((wc) == 0x094D || \
57 /* Types of Japanese characters. Each macro is an inclusive code-point range test and evaluates its argument twice. */
58 #define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF) /* covers the three ranges below plus the gap 0x2FE0..0x303F between them */
59 #define KANJI(wc) ((wc) >= 0x2F00 && (wc) <= 0x2FDF) /* NOTE(review): 0x2F00..0x2FDF is the Unicode Kangxi Radicals block, not the CJK ideograph range; confirm this is intended */
60 #define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F) /* Unicode Hiragana block */
61 #define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF) /* Unicode Katakana block */
/* File-scope record of the position currently being checked. fail() reads
 * these to build its diagnostic: the offset, the current character code,
 * and the text between line_start and line_end.
 */
63 static int offset = 0;
65 static gunichar current_wc = 0;
66 static const char *line_start = NULL; /* first byte of the current line */
67 static const char *line_end = NULL; /* one past the last byte copied for the message */
/* Report a failed check.
 *
 * format, ...: printf-style message; G_GNUC_PRINTF (1, 2) lets the compiler
 * check the arguments against the format string.
 *
 * The formatted message is written to stderr together with the current
 * line number, offset, character code and a copy of the current line.
 * Declared G_GNUC_NORETURN: control does not come back to the caller.
 */
69 static void fail (const char *format, ...) G_GNUC_PRINTF (1, 2) G_GNUC_NORETURN;
70 static void fail (const char *format, ...)
77 va_start (args, format);
78 str = g_strdup_vprintf (format, args); /* the caller's message, formatted into a new string */
81 line_text = g_strndup (line_start, line_end - line_start); /* copy of the text from line_start up to line_end */
83 fprintf (stderr, "line %d offset %d char is " CHFORMAT ": %s\n (line is '%s')\n", line, offset, current_wc, str, line_text);
/* Callback type invoked by log_attr_foreach() once per character. It
 * receives the character and its neighbours, their GUnicodeType values,
 * and pointers to the matching PangoLogAttr entries. prev_attr is NULL for
 * the first character; next_attr is NULL when the next character code is 0.
 */
90 typedef void (* CharForeachFunc) (gunichar wc,
94 GUnicodeType prev_type,
95 GUnicodeType next_type,
97 PangoLogAttr *prev_attr,
98 PangoLogAttr *next_attr,
/* Walk text one UTF-8 character at a time and call func for each one,
 * passing the character, its previous and next neighbours, their Unicode
 * types, and the corresponding entries of the attribute array.
 *
 * text: NUL-terminated string (its length is taken with strlen).
 * func: callback invoked per character.
 */
102 log_attr_foreach (const char *text,
104 CharForeachFunc func,
107 const gchar *next = text;
108 gint length = strlen (text);
109 const gchar *end = text + length;
113 GUnicodeType prev_type;
114 GUnicodeType next_type;
122 prev_type = (GUnicodeType) -1; /* presumably a "no previous character" sentinel; confirm -1 is never a valid GUnicodeType */
125 next_wc = g_utf8_get_char (next); /* prime the look-ahead with the first character */
126 next_type = g_unichar_type (next_wc);
141 next = g_utf8_next_char (next); /* advance by one UTF-8 character */
147 next_wc = g_utf8_get_char (next);
150 next_type = g_unichar_type (next_wc);
152 (* func) (wc, prev_wc, next_wc,
153 type, prev_type, next_type,
155 i != 0 ? &attrs[i-1] : NULL, /* previous attr; NULL when i is 0 */
156 next_wc != 0 ? &attrs[i+1] : NULL, /* next attr; NULL when the next character code is 0 */
/* Per-character line-break checks, used as the callback that
 * check_line_invariants() hands to log_attr_foreach(). Calls fail() when:
 *   - a line break is flagged between \r and \n;
 *   - no line break is flagged after \n;
 *   - the first character of the string is flagged as a line break;
 *   - a non-mandatory break is flagged before a space that has a preceding
 *     attr and is not followed by a combining mark;
 *   - a mandatory break is not also flagged as a line break;
 *   - a non-mandatory break is flagged between two open-punctuation
 *     characters, between two close-punctuation characters, or between an
 *     alphabetic character and a following quotation mark.
 */
174 check_line_char (gunichar wc,
178 GUnicodeType prev_type,
179 GUnicodeType next_type,
181 PangoLogAttr *prev_attr,
182 PangoLogAttr *next_attr,
185 GUnicodeBreakType break_type;
186 GUnicodeBreakType prev_break_type;
188 break_type = g_unichar_break_type (wc); /* line-break class of the current character */
190 prev_break_type = g_unichar_break_type (prev_wc);
192 prev_break_type = G_UNICODE_BREAK_UNKNOWN;
198 if (attr->is_line_break)
199 fail ("line break between \\r and \\n");
202 if (next_attr && !next_attr->is_line_break)
203 fail ("no line break after \\n");
206 if (attr->is_line_break && prev_wc == 0) /* prev_wc == 0 identifies the first character */
207 fail ("first char in string should not be marked as a line break");
/* Rule for a character whose break class is SPACE. */
209 if (break_type == G_UNICODE_BREAK_SPACE)
211 if (attr->is_line_break && prev_attr != NULL &&
212 !attr->is_mandatory_break &&
213 !(next_wc && g_unichar_break_type (next_wc) == G_UNICODE_BREAK_COMBINING_MARK))
214 fail ("can't break lines before a space unless a mandatory break char precedes it or a combining mark follows; prev char was " CHFORMAT, prev_wc);
/* is_mandatory_break must never be set without is_line_break. */
217 if (attr->is_mandatory_break && !attr->is_line_break)
218 fail ("mandatory breaks must also be marked as regular breaks");
222 /* FIXME use the break tables from break.c to automatically
223 * check invariants for each cell in the table. Shouldn't
224 * be that hard to do.
227 if (break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION &&
228 prev_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION &&
229 attr->is_line_break &&
230 !attr->is_mandatory_break)
231 fail ("can't break between two open punctuation chars");
233 if (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION &&
234 prev_break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION &&
235 attr->is_line_break &&
236 !attr->is_mandatory_break)
237 fail ("can't break between two close punctuation chars");
239 if (break_type == G_UNICODE_BREAK_QUOTATION &&
240 prev_break_type == G_UNICODE_BREAK_ALPHABETIC &&
241 attr->is_line_break &&
242 !attr->is_mandatory_break)
243 fail ("can't break letter-quotemark sequence");
/* Run check_line_char() on every character of text by handing it to
 * log_attr_foreach() together with the attribute array; NULL is passed as
 * the final argument.
 */
247 check_line_invariants (const char *text,
250 log_attr_foreach (text, attrs, check_line_char, NULL);
/* Word-boundary invariant check (going by its name); check_invariants()
 * calls it with the text and the attribute array computed for it.
 */
254 check_word_invariants (const char *text,
/* Sentence-boundary invariant check (going by its name); check_invariants()
 * calls it with the text and the attribute array computed for it.
 */
262 check_sentence_invariants (const char *text,
/* Grapheme-boundary invariant check (going by its name); check_invariants()
 * calls it with the text and the attribute array computed for it.
 */
270 check_grapheme_invariants (const char *text,
/* Step through text one UTF-8 character at a time; wherever the attribute
 * for the current position has is_sentence_boundary set, take a copy of the
 * text accumulated since the previous start point (last up to p).
 *
 * text:  string to split.
 * attrs: attribute array indexed by character position.
 */
277 static void print_sentences (const char *text,
278 PangoLogAttr *attrs);
280 print_sentences (const char *text,
292 if (attrs[i].is_sentence_boundary)
294 char *s = g_strndup (last, p - last); /* newly allocated copy of the bytes from last up to, not including, p */
300 p = g_utf8_next_char (p); /* move to the next UTF-8 character */
/* Validate text as UTF-8, compute its PangoLogAttr array using the
 * language tag "C", and run the line, sentence, grapheme and word
 * invariant checks on it. print_sentences() is also called on the same
 * text and attributes. Invalid UTF-8 is reported through fail().
 */
306 check_invariants (const char *text)
311 if (!g_utf8_validate (text, -1, NULL))
312 fail ("Invalid UTF-8 in test text");
314 len = g_utf8_strlen (text, -1); /* length in characters, not bytes */
315 attrs = g_new0 (PangoLogAttr, len + 1); /* zero-filled, one entry more than the character count */
317 pango_get_log_attrs (text,
320 pango_language_from_string ("C"),
324 check_line_invariants (text, attrs);
325 check_sentence_invariants (text, attrs);
326 check_grapheme_invariants (text, attrs);
327 check_word_invariants (text, attrs);
330 print_sentences (text, attrs);
/* Test entry point: point PANGO_RC_FILE at ./pangorc, load
 * "<srcdir>/boundaries.utf8" (srcdir comes from the environment), run
 * check_invariants() on its contents and print a success message.
 * A file that cannot be read is reported through fail().
 */
337 main (int argc, char *argv[])
341 const gchar *filename;
343 g_setenv ("PANGO_RC_FILE", "./pangorc", TRUE); /* TRUE: overwrite any existing value */
345 srcdir = getenv ("srcdir");
/* NOTE(review): g_strdup_printf() returns allocated memory that is stored
 * in a const pointer; confirm it is freed or that leaking it at exit is
 * acceptable.
 */
349 filename = g_strdup_printf ("%s/boundaries.utf8", srcdir);
351 if (!g_file_get_contents (filename, &text, NULL, NULL))
352 fail ("Couldn't open sample text file");
354 check_invariants (text);
358 printf ("testboundaries passed\n");