2 * testboundaries_ucd.c: Test text boundary algorithms with test data from
3 * Unicode Character Database.
5 * Copyright (C) 2003 Noah Levitt
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
17 * You should have received a copy of the GNU Library General Public
18 * License along with this library; if not, write to the
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 02111-1307, USA.
23 #include <pango/pango.h>
28 static gboolean failed = FALSE;
30 /* PangoLogAttr has to be the same size as guint or this hack breaks */
38 /* counts the number of multiplication and division signs up to the first
39 * '#' or null character */
41 count_attrs (gchar *line)
49 ch = g_utf8_get_char (p);
53 /* MULTIPLICATION SIGN, DIVISION SIGN */
54 case 0x00d7: case 0x00f7:
58 /* null char, NUMBER SIGN */
59 case 0x0000: case 0x0023:
66 p = g_utf8_next_char (p);
72 parse_line (gchar *line,
75 PangoLogAttr **attr_return,
79 gunichar ch, character;
84 *num_attrs = count_attrs (line);
85 *attr_return = g_new (PangoLogAttr, *num_attrs);
89 gs = g_string_new (NULL);
95 /* skip white space */
98 ch = g_utf8_get_char (p);
99 p = g_utf8_next_char (p);
101 while (g_unichar_isspace (ch));
105 case 0x00f7: /* DIVISION SIGN: boundary here */
106 temp_attr.bits |= bits.bits;
109 case 0x00d7: /* MULTIPLICATION SIGN: no boundary here */
114 *str_return = g_string_free (gs, FALSE);
117 default: /* unexpected character */
118 g_free (*attr_return);
122 (*attr_return)[i] = temp_attr.attr;
124 /* skip white space */
127 ch = g_utf8_get_char (p);
128 p = g_utf8_next_char (p);
130 while (g_unichar_isspace (ch));
131 p = g_utf8_prev_char (p);
133 if (ch == 0x0023 || ch == 0x0000)
135 *str_return = g_string_free (gs, FALSE);
139 character = strtoul (p, &q, 16);
140 if (q < p + 4 || q > p + 6 || character > 0x10ffff)
142 g_free (*attr_return);
148 gs = g_string_append_unichar (gs, character);
155 attrs_equal (PangoLogAttr *attrs1,
156 PangoLogAttr *attrs2,
163 for (i = 0; i < len; i++)
171 /* can't do a straight comparison because the bitmask may have
172 * multiple bits set, and as long as attr&bitmask is not zero, it
173 * counts as being set */
174 if (((a.bits & bits.bits) && !(b.bits & bits.bits)) ||
175 !(a.bits & bits.bits) && (b.bits & bits.bits))
183 make_test_string (gchar *string,
187 GString *gs = g_string_new (NULL);
197 if ((a.bits & bits.bits) != 0)
198 gs = g_string_append_unichar (gs, 0x00f7);
200 gs = g_string_append_unichar (gs, 0x00d7);
202 g_string_append_c (gs, ' ');
207 ch = g_utf8_get_char (p);
208 g_string_append_printf (gs, "%04X ", ch);
210 p = g_utf8_next_char (p);
214 return g_string_free (gs, FALSE);
218 do_test (gchar *filename,
220 gboolean fixup_broken_linebreaktest)
225 gsize length, terminator_pos;
228 PangoLogAttr *expected_attrs;
233 channel = g_io_channel_new_file (filename, "r", &error);
236 if (error->domain == G_FILE_ERROR && error->code == G_FILE_ERROR_NOENT)
238 g_print ("%s not found. Skipping test.\n", filename);
243 g_printerr ("%s: %s\n", filename, error->message);
252 status = g_io_channel_read_line (channel, &line, &length, &terminator_pos, &error);
256 case G_IO_STATUS_ERROR:
257 g_printerr ("%s: %s\n", filename, error->message);
260 case G_IO_STATUS_EOF:
263 case G_IO_STATUS_AGAIN:
266 case G_IO_STATUS_NORMAL:
267 line[terminator_pos] = '\0';
271 if (! parse_line (line, bits, &string, &expected_attrs, &num_attrs))
273 g_printerr ("%s: error parsing line %d: %s\n", filename, i, line);
279 PangoLogAttr *attrs = g_new (PangoLogAttr, num_attrs);
280 pango_get_log_attrs (string, -1, 0, pango_language_from_string ("C"), attrs, num_attrs);
282 /* LineBreakTest.txt from Unicode 5.1.0 has this bug that it says
283 * breaking is allowed at the beginning of the strings, while the
284 * algorithm says it's not. Fix that up. */
285 if (fixup_broken_linebreaktest)
286 memset (expected_attrs, 0, sizeof (expected_attrs[0]));
288 if (! attrs_equal (attrs, expected_attrs, num_attrs, bits))
290 gchar *str = make_test_string (string, attrs, bits);
291 gchar *comments = strchr (line, '#');
292 if (comments) /* don't print the # comment in the error message. print it separately */
302 g_printerr ("%s: line %d failed\n"
306 filename, i, line, str, comments);
314 g_free (expected_attrs);
321 g_io_channel_unref (channel);
323 g_error_free (error);
335 setlocale (LC_ALL, "");
337 srcdir = getenv ("srcdir");
341 filename = g_strdup_printf ("%s/GraphemeBreakTest.txt", srcdir);
343 bits.attr.is_cursor_position = 1;
344 do_test (filename, bits, FALSE);
346 filename = g_strdup_printf ("%s/WordBreakTest.txt", srcdir);
348 bits.attr.is_word_boundary = 1;
349 do_test (filename, bits, FALSE);
351 filename = g_strdup_printf ("%s/SentenceBreakTest.txt", srcdir);
353 bits.attr.is_sentence_boundary = 1;
354 do_test (filename, bits, FALSE);
356 filename = g_strdup_printf ("%s/LineBreakTest.txt", srcdir);
358 bits.attr.is_line_break = 1;
359 bits.attr.is_mandatory_break = 1;
360 do_test (filename, bits, TRUE);