1 /* gmarkup.c - Simple XML-like parser
3 * Copyright 2000, 2003 Red Hat, Inc.
5 * GLib is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU Lesser General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * GLib is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with GLib; see the file COPYING.LIB. If not,
17 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
35 g_markup_error_quark (void)
37 static GQuark error_quark = 0;
40 error_quark = g_quark_from_static_string ("g-markup-error-quark");
48 STATE_AFTER_OPEN_ANGLE,
49 STATE_AFTER_CLOSE_ANGLE,
50 STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */
51 STATE_INSIDE_OPEN_TAG_NAME,
52 STATE_INSIDE_ATTRIBUTE_NAME,
53 STATE_AFTER_ATTRIBUTE_NAME,
54 STATE_BETWEEN_ATTRIBUTES,
55 STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
56 STATE_INSIDE_ATTRIBUTE_VALUE_SQ,
57 STATE_INSIDE_ATTRIBUTE_VALUE_DQ,
59 STATE_AFTER_CLOSE_TAG_SLASH,
60 STATE_INSIDE_CLOSE_TAG_NAME,
61 STATE_AFTER_CLOSE_TAG_NAME,
62 STATE_INSIDE_PASSTHROUGH,
66 struct _GMarkupParseContext
68 const GMarkupParser *parser;
70 GMarkupParseFlags flags;
76 GDestroyNotify dnotify;
78 /* A piece of character data or an element that
79 * hasn't "ended" yet so we haven't yet called
80 * the callback for it.
82 GString *partial_chunk;
84 GMarkupParseState state;
91 const gchar *current_text;
92 gssize current_text_len;
93 const gchar *current_text_end;
95 GString *leftover_char_portion;
97 /* used to save the start of the last interesting thingy */
102 guint document_empty : 1;
108 * g_markup_parse_context_new:
109 * @parser: a #GMarkupParser
110 * @flags: one or more #GMarkupParseFlags
111 * @user_data: user data to pass to #GMarkupParser functions
112 * @user_data_dnotify: user data destroy notifier called when the parse context is freed
114 * Creates a new parse context. A parse context is used to parse
115 * marked-up documents. You can feed any number of documents into
116 * a context, as long as no errors occur; once an error occurs,
117 * the parse context can't continue to parse text (you have to free it
118 * and create a new parse context).
120 * Return value: a new #GMarkupParseContext
122 GMarkupParseContext *
123 g_markup_parse_context_new (const GMarkupParser *parser,
124 GMarkupParseFlags flags,
126 GDestroyNotify user_data_dnotify)
128 GMarkupParseContext *context;
130 g_return_val_if_fail (parser != NULL, NULL);
132 context = g_new (GMarkupParseContext, 1);
134 context->parser = parser;
135 context->flags = flags;
136 context->user_data = user_data;
137 context->dnotify = user_data_dnotify;
139 context->line_number = 1;
140 context->char_number = 1;
142 context->partial_chunk = NULL;
144 context->state = STATE_START;
145 context->tag_stack = NULL;
146 context->attr_names = NULL;
147 context->attr_values = NULL;
148 context->cur_attr = -1;
149 context->alloc_attrs = 0;
151 context->current_text = NULL;
152 context->current_text_len = -1;
153 context->current_text_end = NULL;
154 context->leftover_char_portion = NULL;
156 context->start = NULL;
157 context->iter = NULL;
159 context->document_empty = TRUE;
160 context->parsing = FALSE;
162 context->balance = 0;
168 * g_markup_parse_context_free:
169 * @context: a #GMarkupParseContext
171 * Frees a #GMarkupParseContext. Can't be called from inside
172 * one of the #GMarkupParser functions.
176 g_markup_parse_context_free (GMarkupParseContext *context)
178 g_return_if_fail (context != NULL);
179 g_return_if_fail (!context->parsing);
181 if (context->dnotify)
182 (* context->dnotify) (context->user_data);
184 g_strfreev (context->attr_names);
185 g_strfreev (context->attr_values);
187 g_slist_foreach (context->tag_stack, (GFunc)g_free, NULL);
188 g_slist_free (context->tag_stack);
190 if (context->partial_chunk)
191 g_string_free (context->partial_chunk, TRUE);
193 if (context->leftover_char_portion)
194 g_string_free (context->leftover_char_portion, TRUE);
200 mark_error (GMarkupParseContext *context,
203 context->state = STATE_ERROR;
205 if (context->parser->error)
206 (*context->parser->error) (context, error, context->user_data);
210 set_error (GMarkupParseContext *context,
220 va_start (args, format);
221 s = g_strdup_vprintf (format, args);
224 tmp_error = g_error_new (G_MARKUP_ERROR,
226 _("Error on line %d char %d: %s"),
227 context->line_number,
228 context->char_number,
233 mark_error (context, tmp_error);
235 g_propagate_error (error, tmp_error);
239 /* To make these faster, we first use the ascii-only tests, then check
240 * for the usual non-alnum name-end chars, and only then call the
241 * expensive unicode stuff. Nobody uses non-ascii in XML tag/attribute
242 * names, so this is a reasonable hack that virtually always avoids
245 #define IS_COMMON_NAME_END_CHAR(c) \
246 ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')
249 is_name_start_char (gunichar c)
251 if (g_ascii_isalpha (c) ||
252 (!IS_COMMON_NAME_END_CHAR (c) &&
253 (g_unichar_isalpha (c) ||
262 is_name_char (gunichar c)
264 if (g_ascii_isalnum (c) ||
265 (!IS_COMMON_NAME_END_CHAR (c) &&
266 (g_unichar_isalnum (c) ||
278 char_str (gunichar c,
282 g_unichar_to_utf8 (c, buf);
287 utf8_str (const gchar *utf8,
290 char_str (g_utf8_get_char (utf8), buf);
295 set_unescape_error (GMarkupParseContext *context,
297 const gchar *remaining_text,
298 const gchar *remaining_text_end,
306 gint remaining_newlines;
309 remaining_newlines = 0;
311 while (p != remaining_text_end)
314 ++remaining_newlines;
318 va_start (args, format);
319 s = g_strdup_vprintf (format, args);
322 tmp_error = g_error_new (G_MARKUP_ERROR,
324 _("Error on line %d: %s"),
325 context->line_number - remaining_newlines,
330 mark_error (context, tmp_error);
332 g_propagate_error (error, tmp_error);
338 USTATE_AFTER_AMPERSAND,
339 USTATE_INSIDE_ENTITY_NAME,
340 USTATE_AFTER_CHARREF_HASH
345 GMarkupParseContext *context;
349 const gchar *text_end;
350 const gchar *entity_start;
354 unescape_text_state_inside_text (UnescapeContext *ucontext,
359 gboolean normalize_attribute;
361 if (ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
362 ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
363 normalize_attribute = TRUE;
365 normalize_attribute = FALSE;
369 while (p != ucontext->text_end)
375 else if (normalize_attribute && (*p == '\t' || *p == '\n'))
377 g_string_append_len (ucontext->str, start, p - start);
378 g_string_append_c (ucontext->str, ' ');
379 p = g_utf8_next_char (p);
384 g_string_append_len (ucontext->str, start, p - start);
385 g_string_append_c (ucontext->str, normalize_attribute ? ' ' : '\n');
386 p = g_utf8_next_char (p);
387 if (p != ucontext->text_end && *p == '\n')
388 p = g_utf8_next_char (p);
392 p = g_utf8_next_char (p);
396 g_string_append_len (ucontext->str, start, p - start);
398 if (p != ucontext->text_end && *p == '&')
400 p = g_utf8_next_char (p);
401 ucontext->state = USTATE_AFTER_AMPERSAND;
408 unescape_text_state_after_ampersand (UnescapeContext *ucontext,
412 ucontext->entity_start = NULL;
416 p = g_utf8_next_char (p);
418 ucontext->entity_start = p;
419 ucontext->state = USTATE_AFTER_CHARREF_HASH;
421 else if (!is_name_start_char (g_utf8_get_char (p)))
425 set_unescape_error (ucontext->context, error,
426 p, ucontext->text_end,
427 G_MARKUP_ERROR_PARSE,
428 _("Empty entity '&;' seen; valid "
429 "entities are: & " < > '"));
435 set_unescape_error (ucontext->context, error,
436 p, ucontext->text_end,
437 G_MARKUP_ERROR_PARSE,
438 _("Character '%s' is not valid at "
439 "the start of an entity name; "
440 "the & character begins an entity; "
441 "if this ampersand isn't supposed "
442 "to be an entity, escape it as "
449 ucontext->entity_start = p;
450 ucontext->state = USTATE_INSIDE_ENTITY_NAME;
/* Consumes a named entity whose name starts at ucontext->entity_start.
 * The loop walks p towards text_end; a character rejected by
 * is_name_char() is reported through set_unescape_error().  When no
 * error was raised and the scan stopped before text_end, p is moved
 * past the semicolon, the state returns to USTATE_INSIDE_TEXT and the
 * name in buf is compared against "lt", "gt", "amp", "quot" and "apos";
 * the matching character is appended to ucontext->str.  Any other name
 * raises "Entity name '%s' is not known".
 * NOTE(review): the final error is presumably raised when text_end is
 * reached without a semicolon -- the branch line is not visible here.
 * Its message must show the escaped spelling "&amp;", since that is
 * the text the user is being told to write.
 */
457 unescape_text_state_inside_entity_name (UnescapeContext *ucontext,
461 #define MAX_ENT_LEN 5
462 gchar buf[MAX_ENT_LEN+1] = {
463 '\0', '\0', '\0', '\0', '\0', '\0'
467 while (p != ucontext->text_end)
471 else if (!is_name_char (*p))
475 set_unescape_error (ucontext->context, error,
476 p, ucontext->text_end,
477 G_MARKUP_ERROR_PARSE,
478 _("Character '%s' is not valid "
479 "inside an entity name"),
484 p = g_utf8_next_char (p);
487 if (ucontext->context->state != STATE_ERROR)
489 if (p != ucontext->text_end)
493 src = ucontext->entity_start;
502 /* move to after semicolon */
503 p = g_utf8_next_char (p);
504 ucontext->state = USTATE_INSIDE_TEXT;
506 if (strcmp (buf, "lt") == 0)
507 g_string_append_c (ucontext->str, '<');
508 else if (strcmp (buf, "gt") == 0)
509 g_string_append_c (ucontext->str, '>');
510 else if (strcmp (buf, "amp") == 0)
511 g_string_append_c (ucontext->str, '&');
512 else if (strcmp (buf, "quot") == 0)
513 g_string_append_c (ucontext->str, '"');
514 else if (strcmp (buf, "apos") == 0)
515 g_string_append_c (ucontext->str, '\'');
518 set_unescape_error (ucontext->context, error,
519 p, ucontext->text_end,
520 G_MARKUP_ERROR_PARSE,
521 _("Entity name '%s' is not known"),
527 set_unescape_error (ucontext->context, error,
528 /* give line number of the & */
529 ucontext->entity_start, ucontext->text_end,
530 G_MARKUP_ERROR_PARSE,
531 _("Entity did not end with a semicolon; "
532 "most likely you used an ampersand "
533 "character without intending to start "
534 "an entity - escape ampersand as &amp;"));
/* Consumes a numeric character reference whose digits begin at
 * ucontext->entity_start.  p is advanced to the next ';' (or to
 * text_end).  When a ';' was found, the text between start and p is
 * copied with g_strndup() and converted with strtoul() in base 16 or
 * base 10; a conversion that does not consume the whole copy, or that
 * leaves errno non-zero, is reported as a parse error.  A value inside
 * 0x20-0xD7FF, 0xE000-0xFFFD or 0x10000-0x10FFFF is appended to
 * ucontext->str as UTF-8 via char_str(); other values are rejected as
 * not encoding a permitted character.  On success p moves past the
 * semicolon and the state returns to USTATE_INSIDE_TEXT.
 * NOTE(review): how is_hex gets set, and any further accepted values
 * ahead of the 0x20 range test, are on lines missing from this listing.
 * The user-facing messages show the example reference and the escaped
 * ampersand in source form ("&#234;", "&amp;").
 */
543 unescape_text_state_after_charref_hash (UnescapeContext *ucontext,
547 gboolean is_hex = FALSE;
550 start = ucontext->entity_start;
555 p = g_utf8_next_char (p);
559 while (p != ucontext->text_end && *p != ';')
560 p = g_utf8_next_char (p);
562 if (p != ucontext->text_end)
564 g_assert (*p == ';');
566 /* digit is between start and p */
570 gchar *digit = g_strndup (start, p - start);
573 gchar *digit_end = digit + (p - start);
577 l = strtoul (digit, &end, 16);
579 l = strtoul (digit, &end, 10);
581 if (end != digit_end || errno != 0)
583 set_unescape_error (ucontext->context, error,
584 start, ucontext->text_end,
585 G_MARKUP_ERROR_PARSE,
586 _("Failed to parse '%s', which "
587 "should have been a digit "
588 "inside a character reference "
589 "(&#234; for example) - perhaps "
590 "the digit is too large"),
595 /* characters XML permits */
599 (l >= 0x20 && l <= 0xD7FF) ||
600 (l >= 0xE000 && l <= 0xFFFD) ||
601 (l >= 0x10000 && l <= 0x10FFFF))
604 g_string_append (ucontext->str, char_str (l, buf));
608 set_unescape_error (ucontext->context, error,
609 start, ucontext->text_end,
610 G_MARKUP_ERROR_PARSE,
611 _("Character reference '%s' does not encode a permitted character"),
618 /* Move to next state */
619 p = g_utf8_next_char (p); /* past semicolon */
620 ucontext->state = USTATE_INSIDE_TEXT;
624 set_unescape_error (ucontext->context, error,
625 start, ucontext->text_end,
626 G_MARKUP_ERROR_PARSE,
627 _("Empty character reference; "
628 "should include a digit such as "
634 set_unescape_error (ucontext->context, error,
635 start, ucontext->text_end,
636 G_MARKUP_ERROR_PARSE,
637 _("Character reference did not end with a "
639 "most likely you used an ampersand "
640 "character without intending to start "
641 "an entity - escape ampersand as &amp;"));
648 unescape_text (GMarkupParseContext *context,
650 const gchar *text_end,
654 UnescapeContext ucontext;
657 ucontext.context = context;
658 ucontext.text = text;
659 ucontext.text_end = text_end;
660 ucontext.entity_start = NULL;
662 ucontext.str = g_string_sized_new (text_end - text);
664 ucontext.state = USTATE_INSIDE_TEXT;
667 while (p != text_end && context->state != STATE_ERROR)
669 g_assert (p < text_end);
671 switch (ucontext.state)
673 case USTATE_INSIDE_TEXT:
675 p = unescape_text_state_inside_text (&ucontext,
681 case USTATE_AFTER_AMPERSAND:
683 p = unescape_text_state_after_ampersand (&ucontext,
690 case USTATE_INSIDE_ENTITY_NAME:
692 p = unescape_text_state_inside_entity_name (&ucontext,
698 case USTATE_AFTER_CHARREF_HASH:
700 p = unescape_text_state_after_charref_hash (&ucontext,
707 g_assert_not_reached ();
712 if (context->state != STATE_ERROR)
714 switch (ucontext.state)
716 case USTATE_INSIDE_TEXT:
718 case USTATE_AFTER_AMPERSAND:
719 case USTATE_INSIDE_ENTITY_NAME:
720 set_unescape_error (context, error,
722 G_MARKUP_ERROR_PARSE,
723 _("Unfinished entity reference"));
725 case USTATE_AFTER_CHARREF_HASH:
726 set_unescape_error (context, error,
728 G_MARKUP_ERROR_PARSE,
729 _("Unfinished character reference"));
734 if (context->state == STATE_ERROR)
736 g_string_free (ucontext.str, TRUE);
742 *unescaped = ucontext.str;
747 static inline gboolean
748 advance_char (GMarkupParseContext *context)
750 context->iter = g_utf8_next_char (context->iter);
751 context->char_number += 1;
753 if (context->iter == context->current_text_end)
757 else if (*context->iter == '\n')
759 context->line_number += 1;
760 context->char_number = 1;
766 static inline gboolean
769 return c == ' ' || c == '\t' || c == '\n' || c == '\r';
773 skip_spaces (GMarkupParseContext *context)
777 if (!xml_isspace (*context->iter))
780 while (advance_char (context));
784 advance_to_name_end (GMarkupParseContext *context)
788 if (!is_name_char (g_utf8_get_char (context->iter)))
791 while (advance_char (context));
795 add_to_partial (GMarkupParseContext *context,
796 const gchar *text_start,
797 const gchar *text_end)
799 if (context->partial_chunk == NULL)
800 context->partial_chunk = g_string_sized_new (text_end - text_start);
802 if (text_start != text_end)
803 g_string_append_len (context->partial_chunk, text_start,
804 text_end - text_start);
806 /* Invariant here that partial_chunk exists */
810 truncate_partial (GMarkupParseContext *context)
812 if (context->partial_chunk != NULL)
814 context->partial_chunk = g_string_truncate (context->partial_chunk, 0);
819 current_element (GMarkupParseContext *context)
821 return context->tag_stack->data;
825 current_attribute (GMarkupParseContext *context)
827 g_assert (context->cur_attr >= 0);
828 return context->attr_names[context->cur_attr];
832 find_current_text_end (GMarkupParseContext *context)
834 /* This function must be safe (non-segfaulting) on invalid UTF8.
835 * It assumes the string starts with a character start
837 const gchar *end = context->current_text + context->current_text_len;
841 g_assert (context->current_text_len > 0);
843 p = g_utf8_find_prev_char (context->current_text, end);
845 g_assert (p != NULL); /* since current_text was a char start */
847 /* p is now the start of the last character or character portion. */
849 next = g_utf8_next_char (p); /* this only touches *p, nothing beyond */
853 /* whole character */
854 context->current_text_end = end;
859 context->leftover_char_portion = g_string_new_len (p, end - p);
860 context->current_text_len -= (end - p);
861 context->current_text_end = p;
867 add_attribute (GMarkupParseContext *context, char *name)
869 if (context->cur_attr + 2 >= context->alloc_attrs)
871 context->alloc_attrs += 5; /* silly magic number */
872 context->attr_names = g_realloc (context->attr_names, sizeof(char*)*context->alloc_attrs);
873 context->attr_values = g_realloc (context->attr_values, sizeof(char*)*context->alloc_attrs);
876 context->attr_names[context->cur_attr] = name;
877 context->attr_values[context->cur_attr] = NULL;
878 context->attr_names[context->cur_attr+1] = NULL;
879 context->attr_values[context->cur_attr+1] = NULL;
883 * g_markup_parse_context_parse:
884 * @context: a #GMarkupParseContext
885 * @text: chunk of text to parse
886 * @text_len: length of @text in bytes
887 * @error: return location for a #GError
889 * Feed some data to the #GMarkupParseContext. The data need not
890 * be valid UTF-8; an error will be signaled if it's invalid.
891 * The data need not be an entire document; you can feed a document
892 * into the parser incrementally, via multiple calls to this function.
893 * Typically, as you receive data from a network connection or file,
894 * you feed each received chunk of data into this function, aborting
895 * the process if an error occurs. Once an error is reported, no further
896 * data may be fed to the #GMarkupParseContext; all errors are fatal.
898 * Return value: %FALSE if an error occurred, %TRUE on success
901 g_markup_parse_context_parse (GMarkupParseContext *context,
906 const gchar *first_invalid;
908 g_return_val_if_fail (context != NULL, FALSE);
909 g_return_val_if_fail (text != NULL, FALSE);
910 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
911 g_return_val_if_fail (!context->parsing, FALSE);
914 text_len = strlen (text);
919 context->parsing = TRUE;
921 if (context->leftover_char_portion)
923 const gchar *first_char;
925 if ((*text & 0xc0) != 0x80)
928 first_char = g_utf8_find_next_char (text, text + text_len);
932 /* leftover_char_portion was completed. Parse it. */
933 GString *portion = context->leftover_char_portion;
935 g_string_append_len (context->leftover_char_portion,
936 text, first_char - text);
938 /* hacks to allow recursion */
939 context->parsing = FALSE;
940 context->leftover_char_portion = NULL;
942 if (!g_markup_parse_context_parse (context,
943 portion->str, portion->len,
946 g_assert (context->state == STATE_ERROR);
949 g_string_free (portion, TRUE);
950 context->parsing = TRUE;
952 /* Skip the fraction of char that was in this text */
953 text_len -= (first_char - text);
958 /* another little chunk of the leftover char; geez
959 * someone is inefficient.
961 g_string_append_len (context->leftover_char_portion,
964 if (context->leftover_char_portion->len > 7)
966 /* The leftover char portion is too big to be
971 G_MARKUP_ERROR_BAD_UTF8,
972 _("Invalid UTF-8 encoded text"));
979 context->current_text = text;
980 context->current_text_len = text_len;
981 context->iter = context->current_text;
982 context->start = context->iter;
984 /* Nothing left after finishing the leftover char, or nothing
985 * passed in to begin with.
987 if (context->current_text_len == 0)
990 /* find_current_text_end () assumes the string starts at
991 * a character start, so we need to validate at least
992 * that much. It doesn't assume any following bytes
995 if ((*context->current_text & 0xc0) == 0x80) /* not a char start */
999 G_MARKUP_ERROR_BAD_UTF8,
1000 _("Invalid UTF-8 encoded text"));
1004 /* Initialize context->current_text_end, possibly adjusting
1005 * current_text_len, and add any leftover char portion
1007 find_current_text_end (context);
1009 /* Validate UTF8 (must be done after we find the end, since
1010 * we could have a trailing incomplete char)
1012 if (!g_utf8_validate (context->current_text,
1013 context->current_text_len,
1018 p = context->current_text;
1019 while (p != context->current_text_end)
1026 context->line_number += newlines;
1030 G_MARKUP_ERROR_BAD_UTF8,
1031 _("Invalid UTF-8 encoded text"));
1035 while (context->iter != context->current_text_end)
1037 switch (context->state)
1040 /* Possible next state: AFTER_OPEN_ANGLE */
1042 g_assert (context->tag_stack == NULL);
1044 /* whitespace is ignored outside of any elements */
1045 skip_spaces (context);
1047 if (context->iter != context->current_text_end)
1049 if (*context->iter == '<')
1051 /* Move after the open angle */
1052 advance_char (context);
1054 context->state = STATE_AFTER_OPEN_ANGLE;
1056 /* this could start a passthrough */
1057 context->start = context->iter;
1059 /* document is now non-empty */
1060 context->document_empty = FALSE;
1066 G_MARKUP_ERROR_PARSE,
1067 _("Document must begin with an element (e.g. <book>)"));
1072 case STATE_AFTER_OPEN_ANGLE:
1073 /* Possible next states: INSIDE_OPEN_TAG_NAME,
1074 * AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH
1076 if (*context->iter == '?' ||
1077 *context->iter == '!')
1079 /* include < in the passthrough */
1080 const gchar *openangle = "<";
1081 add_to_partial (context, openangle, openangle + 1);
1082 context->start = context->iter;
1083 context->balance = 1;
1084 context->state = STATE_INSIDE_PASSTHROUGH;
1086 else if (*context->iter == '/')
1089 advance_char (context);
1091 context->state = STATE_AFTER_CLOSE_TAG_SLASH;
1093 else if (is_name_start_char (g_utf8_get_char (context->iter)))
1095 context->state = STATE_INSIDE_OPEN_TAG_NAME;
1097 /* start of tag name */
1098 context->start = context->iter;
1105 G_MARKUP_ERROR_PARSE,
1106 _("'%s' is not a valid character following "
1107 "a '<' character; it may not begin an "
1109 utf8_str (context->iter, buf));
1113 /* The AFTER_CLOSE_ANGLE state is actually sort of
1114 * broken, because it doesn't correspond to a range
1115 * of characters in the input stream as the others do,
1116 * and thus makes things harder to conceptualize
1118 case STATE_AFTER_CLOSE_ANGLE:
1119 /* Possible next states: INSIDE_TEXT, STATE_START */
1120 if (context->tag_stack == NULL)
1122 context->start = NULL;
1123 context->state = STATE_START;
1127 context->start = context->iter;
1128 context->state = STATE_INSIDE_TEXT;
1132 case STATE_AFTER_ELISION_SLASH:
1133 /* Possible next state: AFTER_CLOSE_ANGLE */
1136 /* We need to pop the tag stack and call the end_element
1137 * function, since this is the close tag
1139 GError *tmp_error = NULL;
1141 g_assert (context->tag_stack != NULL);
1144 if (context->parser->end_element)
1145 (* context->parser->end_element) (context,
1146 context->tag_stack->data,
1152 mark_error (context, tmp_error);
1153 g_propagate_error (error, tmp_error);
1157 if (*context->iter == '>')
1159 /* move after the close angle */
1160 advance_char (context);
1161 context->state = STATE_AFTER_CLOSE_ANGLE;
1168 G_MARKUP_ERROR_PARSE,
1169 _("Odd character '%s', expected a '>' character "
1170 "to end the start tag of element '%s'"),
1171 utf8_str (context->iter, buf),
1172 current_element (context));
1176 g_free (context->tag_stack->data);
1177 context->tag_stack = g_slist_delete_link (context->tag_stack,
1178 context->tag_stack);
1182 case STATE_INSIDE_OPEN_TAG_NAME:
1183 /* Possible next states: BETWEEN_ATTRIBUTES */
1185 /* if there's a partial chunk then it's the first part of the
1186 * tag name. If there's a context->start then it's the start
1187 * of the tag name in current_text, the partial chunk goes
1188 * before that start though.
1190 advance_to_name_end (context);
1192 if (context->iter == context->current_text_end)
1194 /* The name hasn't necessarily ended. Merge with
1195 * partial chunk, leave state unchanged.
1197 add_to_partial (context, context->start, context->iter);
1201 /* The name has ended. Combine it with the partial chunk
1202 * if any; push it on the stack; enter next state.
1204 add_to_partial (context, context->start, context->iter);
1205 context->tag_stack =
1206 g_slist_prepend (context->tag_stack,
1207 g_string_free (context->partial_chunk,
1210 context->partial_chunk = NULL;
1212 context->state = STATE_BETWEEN_ATTRIBUTES;
1213 context->start = NULL;
1217 case STATE_INSIDE_ATTRIBUTE_NAME:
1218 /* Possible next states: AFTER_ATTRIBUTE_NAME */
1220 advance_to_name_end (context);
1221 add_to_partial (context, context->start, context->iter);
1223 /* read the full name, if we enter the equals sign state
1224 * then add the attribute to the list (without the value),
1225 * otherwise store a partial chunk to be prepended later.
1227 if (context->iter != context->current_text_end)
1228 context->state = STATE_AFTER_ATTRIBUTE_NAME;
1231 case STATE_AFTER_ATTRIBUTE_NAME:
1232 /* Possible next states: AFTER_ATTRIBUTE_EQUALS_SIGN */
1234 skip_spaces (context);
1236 if (context->iter != context->current_text_end)
1238 /* The attribute name has ended. Hand the accumulated partial
1239 * chunk to add_attribute(); then look for the '=' sign.
1241 add_attribute (context, g_string_free (context->partial_chunk, FALSE));
1243 context->partial_chunk = NULL;
1244 context->start = NULL;
1246 if (*context->iter == '=')
1248 advance_char (context);
1249 context->state = STATE_AFTER_ATTRIBUTE_EQUALS_SIGN;
1256 G_MARKUP_ERROR_PARSE,
1257 _("Odd character '%s', expected a '=' after "
1258 "attribute name '%s' of element '%s'"),
1259 utf8_str (context->iter, buf),
1260 current_attribute (context),
1261 current_element (context));
/* Parser position: inside a start tag, between attributes.
 * After skipping whitespace, '/' leads to STATE_AFTER_ELISION_SLASH,
 * '>' to STATE_AFTER_CLOSE_ANGLE and a name-start character to
 * STATE_INSIDE_ATTRIBUTE_NAME; the "Odd character" message covers the
 * remaining case.  Once the state shows the tag is finished, the
 * start_element callback (if set) is invoked.  When no attribute was
 * collected (cur_attr < 0) both array arguments point at the local
 * NULL sentinel `empty`, so the callback always receives a valid,
 * NULL-terminated array rather than a NULL pointer.  Afterwards every
 * stored attribute name/value is freed and cur_attr runs back to -1.
 */
1267 case STATE_BETWEEN_ATTRIBUTES:
1268 /* Possible next states: AFTER_CLOSE_ANGLE,
1269 * AFTER_ELISION_SLASH, INSIDE_ATTRIBUTE_NAME
1271 skip_spaces (context);
1273 if (context->iter != context->current_text_end)
1275 if (*context->iter == '/')
1277 advance_char (context);
1278 context->state = STATE_AFTER_ELISION_SLASH;
1280 else if (*context->iter == '>')
1283 advance_char (context);
1284 context->state = STATE_AFTER_CLOSE_ANGLE;
1286 else if (is_name_start_char (g_utf8_get_char (context->iter)))
1288 context->state = STATE_INSIDE_ATTRIBUTE_NAME;
1289 /* start of attribute name */
1290 context->start = context->iter;
1297 G_MARKUP_ERROR_PARSE,
1298 _("Odd character '%s', expected a '>' or '/' "
1299 "character to end the start tag of "
1300 "element '%s', or optionally an attribute; "
1301 "perhaps you used an invalid character in "
1302 "an attribute name"),
1303 utf8_str (context->iter, buf),
1304 current_element (context));
1307 /* If we're done with attributes, invoke
1308 * the start_element callback
1310 if (context->state == STATE_AFTER_ELISION_SLASH ||
1311 context->state == STATE_AFTER_CLOSE_ANGLE)
1313 const gchar *start_name;
1314 /* Ugly, but the current code expects an empty array instead of NULL */
1315 const gchar *empty = NULL;
1316 const gchar **attr_names = &empty;
1317 const gchar **attr_values = &empty;
1320 /* Call user callback for element start */
1321 start_name = current_element (context);
1323 if (context->cur_attr >= 0)
1325 attr_names = (const gchar**)context->attr_names;
1326 attr_values = (const gchar**)context->attr_values;
1330 if (context->parser->start_element)
1331 (* context->parser->start_element) (context,
1333 (const gchar **)attr_names,
1334 (const gchar **)attr_values,
1338 /* Go ahead and free the attributes. */
1339 for (; context->cur_attr >= 0; context->cur_attr--)
1341 int pos = context->cur_attr;
1342 g_free (context->attr_names[pos]);
1343 g_free (context->attr_values[pos]);
1344 context->attr_names[pos] = context->attr_values[pos] = NULL;
1346 g_assert (context->cur_attr == -1);
1347 g_assert (context->attr_names == NULL ||
1348 context->attr_names[0] == NULL);
1349 g_assert (context->attr_values == NULL ||
1350 context->attr_values[0] == NULL);
1352 if (tmp_error != NULL)
1354 mark_error (context, tmp_error);
1355 g_propagate_error (error, tmp_error);
1361 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1362 /* Possible next state: INSIDE_ATTRIBUTE_VALUE_[SQ/DQ] */
1364 skip_spaces (context);
1366 if (context->iter != context->current_text_end)
1368 if (*context->iter == '"')
1370 advance_char (context);
1371 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_DQ;
1372 context->start = context->iter;
1374 else if (*context->iter == '\'')
1376 advance_char (context);
1377 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_SQ;
1378 context->start = context->iter;
1385 G_MARKUP_ERROR_PARSE,
1386 _("Odd character '%s', expected an open quote mark "
1387 "after the equals sign when giving value for "
1388 "attribute '%s' of element '%s'"),
1389 utf8_str (context->iter, buf),
1390 current_attribute (context),
1391 current_element (context));
1396 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1397 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1398 /* Possible next states: BETWEEN_ATTRIBUTES */
1402 if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ)
1413 if (*context->iter == delim)
1416 while (advance_char (context));
1418 if (context->iter == context->current_text_end)
1420 /* The value hasn't necessarily ended. Merge with
1421 * partial chunk, leave state unchanged.
1423 add_to_partial (context, context->start, context->iter);
1427 /* The value has ended at the quote mark. Combine it
1428 * with the partial chunk if any; set it for the current
1433 add_to_partial (context, context->start, context->iter);
1435 g_assert (context->cur_attr >= 0);
1437 if (unescape_text (context,
1438 context->partial_chunk->str,
1439 context->partial_chunk->str +
1440 context->partial_chunk->len,
1444 /* success, advance past quote and set state. */
1445 context->attr_values[context->cur_attr] = g_string_free (unescaped, FALSE);
1446 advance_char (context);
1447 context->state = STATE_BETWEEN_ATTRIBUTES;
1448 context->start = NULL;
1451 truncate_partial (context);
1455 case STATE_INSIDE_TEXT:
1456 /* Possible next states: AFTER_OPEN_ANGLE */
1459 if (*context->iter == '<')
1462 while (advance_char (context));
1464 /* The text hasn't necessarily ended. Merge with
1465 * partial chunk, leave state unchanged.
1468 add_to_partial (context, context->start, context->iter);
1470 if (context->iter != context->current_text_end)
1472 GString *unescaped = NULL;
1474 /* The text has ended at the open angle. Call the text
1478 if (unescape_text (context,
1479 context->partial_chunk->str,
1480 context->partial_chunk->str +
1481 context->partial_chunk->len,
1485 GError *tmp_error = NULL;
1487 if (context->parser->text)
1488 (*context->parser->text) (context,
1494 g_string_free (unescaped, TRUE);
1496 if (tmp_error == NULL)
1498 /* advance past open angle and set state. */
1499 advance_char (context);
1500 context->state = STATE_AFTER_OPEN_ANGLE;
1501 /* could begin a passthrough */
1502 context->start = context->iter;
1506 mark_error (context, tmp_error);
1507 g_propagate_error (error, tmp_error);
1511 truncate_partial (context);
1515 case STATE_AFTER_CLOSE_TAG_SLASH:
1516 /* Possible next state: INSIDE_CLOSE_TAG_NAME */
1517 if (is_name_start_char (g_utf8_get_char (context->iter)))
1519 context->state = STATE_INSIDE_CLOSE_TAG_NAME;
1521 /* start of tag name */
1522 context->start = context->iter;
1529 G_MARKUP_ERROR_PARSE,
1530 _("'%s' is not a valid character following "
1531 "the characters '</'; '%s' may not begin an "
1533 utf8_str (context->iter, buf),
1534 utf8_str (context->iter, buf));
1538 case STATE_INSIDE_CLOSE_TAG_NAME:
1539 /* Possible next state: AFTER_CLOSE_TAG_NAME */
1540 advance_to_name_end (context);
1541 add_to_partial (context, context->start, context->iter);
1543 if (context->iter != context->current_text_end)
1544 context->state = STATE_AFTER_CLOSE_TAG_NAME;
1547 case STATE_AFTER_CLOSE_TAG_NAME:
1548 /* Possible next state: AFTER_CLOSE_ANGLE */
1550 skip_spaces (context);
1552 if (context->iter != context->current_text_end)
1556 /* The name has ended. Combine it with the partial chunk
1557 * if any; check that it matches stack top and pop
1558 * stack; invoke proper callback; enter next state.
1560 close_name = g_string_free (context->partial_chunk, FALSE);
1561 context->partial_chunk = NULL;
1563 if (*context->iter != '>')
1568 G_MARKUP_ERROR_PARSE,
1569 _("'%s' is not a valid character following "
1570 "the close element name '%s'; the allowed "
1571 "character is '>'"),
1572 utf8_str (context->iter, buf),
1575 else if (context->tag_stack == NULL)
1579 G_MARKUP_ERROR_PARSE,
1580 _("Element '%s' was closed, no element "
1581 "is currently open"),
1584 else if (strcmp (close_name, current_element (context)) != 0)
1588 G_MARKUP_ERROR_PARSE,
1589 _("Element '%s' was closed, but the currently "
1590 "open element is '%s'"),
1592 current_element (context));
1597 advance_char (context);
1598 context->state = STATE_AFTER_CLOSE_ANGLE;
1599 context->start = NULL;
1601 /* call the end_element callback */
1603 if (context->parser->end_element)
1604 (* context->parser->end_element) (context,
1610 /* Pop the tag stack */
1611 g_free (context->tag_stack->data);
1612 context->tag_stack = g_slist_delete_link (context->tag_stack,
1613 context->tag_stack);
1617 mark_error (context, tmp_error);
1618 g_propagate_error (error, tmp_error);
1622 g_free (close_name);
1626 case STATE_INSIDE_PASSTHROUGH:
1627 /* Possible next state: AFTER_CLOSE_ANGLE */
1630 if (*context->iter == '<')
1632 if (*context->iter == '>')
1635 add_to_partial (context, context->start, context->iter);
1636 context->start = context->iter;
1637 if ((g_str_has_prefix (context->partial_chunk->str, "<?")
1638 && g_str_has_suffix (context->partial_chunk->str, "?")) ||
1639 (g_str_has_prefix (context->partial_chunk->str, "<!--")
1640 && g_str_has_suffix (context->partial_chunk->str, "--")) ||
1641 (g_str_has_prefix (context->partial_chunk->str, "<![CDATA[")
1642 && g_str_has_suffix (context->partial_chunk->str, "]]")) ||
1643 (g_str_has_prefix (context->partial_chunk->str, "<!DOCTYPE")
1644 && context->balance == 0))
1648 while (advance_char (context));
1650 if (context->iter == context->current_text_end)
1652 /* The passthrough hasn't necessarily ended. Merge with
1653 * partial chunk, leave state unchanged.
1655 add_to_partial (context, context->start, context->iter);
1659 /* The passthrough has ended at the close angle. Combine
1660 * it with the partial chunk if any. Call the passthrough
1661 * callback. Note that the open/close angles are
1662 * included in the text of the passthrough.
1664 GError *tmp_error = NULL;
1666 advance_char (context); /* advance past close angle */
1667 add_to_partial (context, context->start, context->iter);
1669 if (context->parser->passthrough)
1670 (*context->parser->passthrough) (context,
1671 context->partial_chunk->str,
1672 context->partial_chunk->len,
1676 truncate_partial (context);
1678 if (tmp_error == NULL)
1680 context->state = STATE_AFTER_CLOSE_ANGLE;
1681 context->start = context->iter; /* could begin text */
1685 mark_error (context, tmp_error);
1686 g_propagate_error (error, tmp_error);
1696 g_assert_not_reached ();
1702 context->parsing = FALSE;
1704 return context->state != STATE_ERROR;
1708 * g_markup_parse_context_end_parse:
1709 * @context: a #GMarkupParseContext
1710 * @error: return location for a #GError
1712 * Signals to the #GMarkupParseContext that all data has been
1713 * fed into the parse context with g_markup_parse_context_parse().
1714 * This function reports an error if the document isn't complete,
1715 * for example if elements are still open.
1717 * Return value: %TRUE on success, %FALSE if an error was set
1720 g_markup_parse_context_end_parse (GMarkupParseContext *context,
1723 g_return_val_if_fail (context != NULL, FALSE);
1724 g_return_val_if_fail (!context->parsing, FALSE);
1725 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
1727 if (context->partial_chunk != NULL)
1729 g_string_free (context->partial_chunk, TRUE);
1730 context->partial_chunk = NULL;
1733 if (context->document_empty)
1735 set_error (context, error, G_MARKUP_ERROR_EMPTY,
1736 _("Document was empty or contained only whitespace"));
1740 context->parsing = TRUE;
1742 switch (context->state)
1748 case STATE_AFTER_OPEN_ANGLE:
1749 set_error (context, error, G_MARKUP_ERROR_PARSE,
1750 _("Document ended unexpectedly just after an open angle bracket '<'"));
1753 case STATE_AFTER_CLOSE_ANGLE:
1754 if (context->tag_stack != NULL)
1756 /* Error message the same as for INSIDE_TEXT */
1757 set_error (context, error, G_MARKUP_ERROR_PARSE,
1758 _("Document ended unexpectedly with elements still open - "
1759 "'%s' was the last element opened"),
1760 current_element (context));
1764 case STATE_AFTER_ELISION_SLASH:
1765 set_error (context, error, G_MARKUP_ERROR_PARSE,
1766 _("Document ended unexpectedly, expected to see a close angle "
1767 "bracket ending the tag <%s/>"), current_element (context));
1770 case STATE_INSIDE_OPEN_TAG_NAME:
1771 set_error (context, error, G_MARKUP_ERROR_PARSE,
1772 _("Document ended unexpectedly inside an element name"));
1775 case STATE_INSIDE_ATTRIBUTE_NAME:
1776 set_error (context, error, G_MARKUP_ERROR_PARSE,
1777 _("Document ended unexpectedly inside an attribute name"));
1780 case STATE_BETWEEN_ATTRIBUTES:
1781 set_error (context, error, G_MARKUP_ERROR_PARSE,
1782 _("Document ended unexpectedly inside an element-opening "
1786 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1787 set_error (context, error, G_MARKUP_ERROR_PARSE,
1788 _("Document ended unexpectedly after the equals sign "
1789 "following an attribute name; no attribute value"));
1792 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1793 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1794 set_error (context, error, G_MARKUP_ERROR_PARSE,
1795 _("Document ended unexpectedly while inside an attribute "
1799 case STATE_INSIDE_TEXT:
1800 g_assert (context->tag_stack != NULL);
1801 set_error (context, error, G_MARKUP_ERROR_PARSE,
1802 _("Document ended unexpectedly with elements still open - "
1803 "'%s' was the last element opened"),
1804 current_element (context));
1807 case STATE_AFTER_CLOSE_TAG_SLASH:
1808 case STATE_INSIDE_CLOSE_TAG_NAME:
1809 set_error (context, error, G_MARKUP_ERROR_PARSE,
1810 _("Document ended unexpectedly inside the close tag for "
1811 "element '%s'"), current_element);
1814 case STATE_INSIDE_PASSTHROUGH:
1815 set_error (context, error, G_MARKUP_ERROR_PARSE,
1816 _("Document ended unexpectedly inside a comment or "
1817 "processing instruction"));
1822 g_assert_not_reached ();
1826 context->parsing = FALSE;
1828 return context->state != STATE_ERROR;
1832 * g_markup_parse_context_get_element:
1833 * @context: a #GMarkupParseContext
1834 * @returns: the name of the currently open element, or %NULL
1836 * Retrieves the name of the currently open element.
1840 G_CONST_RETURN gchar *
1841 g_markup_parse_context_get_element (GMarkupParseContext *context)
1843 g_return_val_if_fail (context != NULL, NULL);
1845 if (context->tag_stack == NULL)
1848 return current_element (context);
1852 * g_markup_parse_context_get_position:
1853 * @context: a #GMarkupParseContext
1854 * @line_number: return location for a line number, or %NULL
1855 * @char_number: return location for a char-on-line number, or %NULL
1857 * Retrieves the current line number and the number of the character on
1858 * that line. Intended for use in error messages; there are no strict
1859 * semantics for what constitutes the "current" line number other than
1860 * "the best number we could come up with for error messages."
1864 g_markup_parse_context_get_position (GMarkupParseContext *context,
1868 g_return_if_fail (context != NULL);
1871 *line_number = context->line_number;
1874 *char_number = context->char_number;
1878 append_escaped_text (GString *str,
1886 end = text + length;
1891 next = g_utf8_next_char (p);
1896 g_string_append (str, "&");
1900 g_string_append (str, "<");
1904 g_string_append (str, ">");
1908 g_string_append (str, "'");
1912 g_string_append (str, """);
1916 g_string_append_len (str, p, next - p);
1925 * g_markup_escape_text:
1926 * @text: some valid UTF-8 text
1927 * @length: length of @text in bytes
1929 * Escapes text so that the markup parser will parse it verbatim.
1930 * Less than, greater than, ampersand, etc. are replaced with the
1931 * corresponding entities. This function would typically be used
1932 * when writing out a file to be parsed with the markup parser.
1934 * Note that this function doesn't protect whitespace and line endings
1935 * from being processed according to the XML rules for normalization
1936 * of line endings and attribute values.
1938 * Return value: escaped text
1941 g_markup_escape_text (const gchar *text,
1946 g_return_val_if_fail (text != NULL, NULL);
1949 length = strlen (text);
1951 /* prealloc at least as long as original text */
1952 str = g_string_sized_new (length);
1953 append_escaped_text (str, text, length);
1955 return g_string_free (str, FALSE);
/* Returns a pointer to the first character at or after @cp that is
 * not an ASCII decimal digit.
 */
static const char *
conversion_skip_digits (const char *cp)
{
  while (*cp >= '0' && *cp <= '9')
    cp++;

  return cp;
}

/* If @cp begins a positional-argument reference (one or more digits
 * followed by '$'), returns a pointer just past the '$'. Otherwise
 * returns @cp unchanged.
 */
static const char *
conversion_skip_position (const char *cp)
{
  const char *np;

  if (!(*cp >= '0' && *cp <= '9'))
    return cp;

  np = conversion_skip_digits (cp);

  return (*np == '$') ? np + 1 : cp;
}

/**
 * find_conversion:
 * @format: a printf-style format string
 * @after: location to store a pointer to the character after
 *   the returned conversion. On a %NULL return, returns the
 *   pointer to the trailing NUL in the string
 *
 * Find the next conversion in a printf-style format string.
 * Partially based on code from printf-parser.c,
 * Copyright (C) 1999-2000, 2002-2003 Free Software Foundation, Inc.
 *
 * A conversion is taken to be '%', an optional positional argument,
 * flags, a field width, a precision, size modifiers and one conversion
 * character. "%%" is reported like any other conversion. A lone '%'
 * at the very end of @format is not a conversion. @after never points
 * beyond the terminating NUL of @format.
 *
 * Return value: pointer to the next conversion in @format,
 *  or %NULL, if none.
 **/
static const char *
find_conversion (const char  *format,
                 const char **after)
{
  const char *start = format;
  const char *cp;

  while (*start != '\0' && *start != '%')
    start++;

  if (*start == '\0')
    {
      *after = start;
      return NULL;
    }

  cp = start + 1;

  if (*cp == '\0')
    {
      *after = cp;
      return NULL;
    }

  /* Test for positional argument.  */
  cp = conversion_skip_position (cp);

  /* Skip the flags.  */
  while (*cp == '\'' ||
         *cp == '-' ||
         *cp == '+' ||
         *cp == ' ' ||
         *cp == '#' ||
         *cp == '0')
    cp++;

  /* Skip the field width: either '*' (optionally followed by a
   * positional argument) or a run of digits.
   */
  if (*cp == '*')
    cp = conversion_skip_position (cp + 1);
  else
    cp = conversion_skip_digits (cp);

  /* Skip the precision, which has the same shape as the field width
   * but is introduced by '.'. The '*' itself has to be stepped over
   * before looking for a positional argument, otherwise it would be
   * mistaken for the conversion character.
   */
  if (*cp == '.')
    {
      cp++;
      if (*cp == '*')
        cp = conversion_skip_position (cp + 1);
      else
        cp = conversion_skip_digits (cp);
    }

  /* Skip argument type/size specifiers.  */
  while (*cp == 'h' ||
         *cp == 'L' ||
         *cp == 'l' ||
         *cp == 'j' ||
         *cp == 'z' ||
         *cp == 'Z' ||
         *cp == 't')
    cp++;

  /* Skip the conversion character, unless the format string was cut
   * short and there is none to skip.
   */
  if (*cp != '\0')
    cp++;

  *after = cp;
  return start;
}
2085 * g_markup_vprintf_escaped:
2086 * @format: printf() style format string
2087 * @args: variable argument list, similar to vprintf()
2089 * Formats the data in @args according to @format, escaping
2090 * all string and character arguments in the fashion
2091 * of g_markup_escape_text(). See g_markup_printf_escaped().
2093 * Return value: newly allocated result from formatting
2094 * operation. Free with g_free().
2099 g_markup_vprintf_escaped (const char *format,
2104 GString *result = NULL;
2105 gchar *output1 = NULL;
2106 gchar *output2 = NULL;
2107 const char *p, *op1, *op2;
2110 /* The technique here, is that we make two format strings that
2111 * have the identical conversions in the identical order to the
2112 * original strings, but differ in the text in-between. We
2113 * then use the normal g_strdup_vprintf() to format the arguments
2114 * with the two new format strings. By comparing the results,
2115 * we can figure out what segments of the output come from
2116 * the the original format string, and what from the arguments,
2117 * and thus know what portions of the string to escape.
2119 * For instance, for:
2121 * g_markup_printf_escaped ("%s ate %d apples", "Susan & Fred", 5);
2123 * We form the two format strings "%sX%dX" and %sY%sY". The results
2124 * of formatting with those two strings are
2126 * "%sX%dX" => "Susan & FredX5X"
2127 * "%sY%dY" => "Susan & FredY5Y"
2129 * To find the span of the first argument, we find the first position
2130 * where the two arguments differ, which tells us that the first
2131 * argument formatted to "Susan & Fred". We then escape that
2132 * to "Susan & Fred" and join up with the intermediate portions
2133 * of the format string and the second argument to get
2134 * "Susan & Fred ate 5 apples".
2137 /* Create the two modified format strings
2139 format1 = g_string_new (NULL);
2140 format2 = g_string_new (NULL);
2145 const char *conv = find_conversion (p, &after);
2149 g_string_append_len (format1, conv, after - conv);
2150 g_string_append_c (format1, 'X');
2151 g_string_append_len (format2, conv, after - conv);
2152 g_string_append_c (format2, 'Y');
2157 /* Use them to format the arguments
2159 G_VA_COPY (args2, args);
2161 output1 = g_strdup_vprintf (format1->str, args);
2166 output2 = g_strdup_vprintf (format2->str, args2);
2171 result = g_string_new (NULL);
2173 /* Iterate through the original format string again,
2174 * copying the non-conversion portions and the escaped
2175 * converted arguments to the output string.
2183 const char *output_start;
2184 const char *conv = find_conversion (p, &after);
2187 if (!conv) /* The end, after points to the trailing \0 */
2189 g_string_append_len (result, p, after - p);
2193 g_string_append_len (result, p, conv - p);
2195 while (*op1 == *op2)
2201 escaped = g_markup_escape_text (output_start, op1 - output_start);
2202 g_string_append (result, escaped);
2211 g_string_free (format1, TRUE);
2212 g_string_free (format2, TRUE);
2217 return g_string_free (result, FALSE);
2223 * g_markup_printf_escaped:
2224 * @format: printf() style format string
2225 * @Varargs: the arguments to insert in the format string
2227 * Formats arguments according to @format, escaping
2228 * all string and character arguments in the fashion
2229 * of g_markup_escape_text(). This is useful when you
2230 * want to insert literal strings into XML-style markup
2231 * output, without having to worry that the strings
2232 * might themselves contain markup.
2234 * <informalexample><programlisting>
2235 * const char *store = "Fortnum & Mason";
2236 * const char *item = "Tea";
2239 * output = g_markup_printf_escaped ("<purchase>"
2240 * "<store>%s</store>"
2241 * "<item>%s</item>"
2242 * "</purchase>",
2244 * </programlisting></informalexample>
2246 * Return value: newly allocated result from formatting
2247 * operation. Free with g_free().
2252 g_markup_printf_escaped (const char *format, ...)
2257 va_start (args, format);
2258 result = g_markup_vprintf_escaped (format, args);