1 /* gmarkup.c - Simple XML-like parser
3 * Copyright 2000, 2003 Red Hat, Inc.
5 * GLib is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU Lesser General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * GLib is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with GLib; see the file COPYING.LIB. If not,
17 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
/* Returns the GQuark that g_quark_from_static_string() yields for the
 * static string "g-markup-error-quark". */
35 g_markup_error_quark (void)
37 return g_quark_from_static_string ("g-markup-error-quark");
/* States of the main parser state machine; the switch in
 * g_markup_parse_context_parse() dispatches on these.
 * NOTE(review): STATE_START, STATE_INSIDE_TEXT and STATE_ERROR are used
 * elsewhere in this file but their enumerator lines are not visible in this
 * listing - confirm against the full enum. */
43 STATE_AFTER_OPEN_ANGLE,
44 STATE_AFTER_CLOSE_ANGLE,
45 STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */
46 STATE_INSIDE_OPEN_TAG_NAME,
47 STATE_INSIDE_ATTRIBUTE_NAME,
48 STATE_AFTER_ATTRIBUTE_NAME,
49 STATE_BETWEEN_ATTRIBUTES,
50 STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
51 STATE_INSIDE_ATTRIBUTE_VALUE_SQ,
52 STATE_INSIDE_ATTRIBUTE_VALUE_DQ,
54 STATE_AFTER_CLOSE_TAG_SLASH,
55 STATE_INSIDE_CLOSE_TAG_NAME,
56 STATE_AFTER_CLOSE_TAG_NAME,
57 STATE_INSIDE_PASSTHROUGH,
/* Per-parse state shared by every function in this file.
 * NOTE(review): several members that the code below reads and writes
 * (line_number, char_number, user_data, tag_stack, attr_names, attr_values,
 * cur_attr, alloc_attrs, start, iter, parsing, balance) are not visible in
 * this listing - confirm their declarations in the full struct. */
61 struct _GMarkupParseContext
/* Callback table supplied by the caller of g_markup_parse_context_new(). */
63 const GMarkupParser *parser;
65 GMarkupParseFlags flags;
/* Invoked on user_data by g_markup_parse_context_free() when non-NULL. */
71 GDestroyNotify dnotify;
73 /* A piece of character data or an element that
74 * hasn't "ended" yet so we haven't yet called
75 * the callback for it.
77 GString *partial_chunk;
79 GMarkupParseState state;
/* The chunk currently being parsed: its start, byte length, and the end
 * position computed by find_current_text_end(). */
86 const gchar *current_text;
87 gssize current_text_len;
88 const gchar *current_text_end;
/* Trailing bytes of an incomplete UTF-8 character, carried over to the
 * next g_markup_parse_context_parse() call. */
90 GString *leftover_char_portion;
92 /* used to save the start of the last interesting thingy */
/* Set to TRUE at creation, cleared once the first '<' is seen. */
97 guint document_empty : 1;
103 * g_markup_parse_context_new:
104 * @parser: a #GMarkupParser
105 * @flags: one or more #GMarkupParseFlags
106 * @user_data: user data to pass to #GMarkupParser functions
107 * @user_data_dnotify: user data destroy notifier called when the parse context is freed
109 * Creates a new parse context. A parse context is used to parse
110 * marked-up documents. You can feed any number of documents into
111 * a context, as long as no errors occur; once an error occurs,
112 * the parse context can't continue to parse text (you have to free it
113 * and create a new parse context).
115 * Return value: a new #GMarkupParseContext
/* Allocates a GMarkupParseContext with g_new() and initializes every field
 * to its start-of-document value.  A NULL parser fails the
 * g_return_val_if_fail() check, which yields NULL.
 * NOTE(review): the final return statement is not visible in this listing;
 * presumably the new context is returned. */
117 GMarkupParseContext *
118 g_markup_parse_context_new (const GMarkupParser *parser,
119 GMarkupParseFlags flags,
121 GDestroyNotify user_data_dnotify)
123 GMarkupParseContext *context;
125 g_return_val_if_fail (parser != NULL, NULL);
127 context = g_new (GMarkupParseContext, 1);
129 context->parser = parser;
130 context->flags = flags;
131 context->user_data = user_data;
132 context->dnotify = user_data_dnotify;
/* Positions used in error messages are 1-based. */
134 context->line_number = 1;
135 context->char_number = 1;
137 context->partial_chunk = NULL;
139 context->state = STATE_START;
140 context->tag_stack = NULL;
141 context->attr_names = NULL;
142 context->attr_values = NULL;
/* -1 means "no attribute collected yet"; see the cur_attr >= 0 checks. */
143 context->cur_attr = -1;
144 context->alloc_attrs = 0;
146 context->current_text = NULL;
147 context->current_text_len = -1;
148 context->current_text_end = NULL;
149 context->leftover_char_portion = NULL;
151 context->start = NULL;
152 context->iter = NULL;
154 context->document_empty = TRUE;
155 context->parsing = FALSE;
157 context->balance = 0;
163 * g_markup_parse_context_free:
164 * @context: a #GMarkupParseContext
166 * Frees a #GMarkupParseContext. Can't be called from inside
167 * one of the #GMarkupParser functions.
/* Releases the resources owned by the context.  Does nothing (beyond the
 * g_return_if_fail warning) when context is NULL or when it is called while
 * context->parsing is set, i.e. from inside a parser callback.
 * NOTE(review): the release of the context structure itself is not visible
 * in this listing - confirm it is freed after the members. */
171 g_markup_parse_context_free (GMarkupParseContext *context)
173 g_return_if_fail (context != NULL);
174 g_return_if_fail (!context->parsing);
/* Give the caller a chance to destroy its user data. */
176 if (context->dnotify)
177 (* context->dnotify) (context->user_data);
179 g_strfreev (context->attr_names);
180 g_strfreev (context->attr_values);
/* The tag stack owns its element-name strings: free each, then the list. */
182 g_slist_foreach (context->tag_stack, (GFunc)g_free, NULL);
183 g_slist_free (context->tag_stack);
185 if (context->partial_chunk)
186 g_string_free (context->partial_chunk, TRUE);
188 if (context->leftover_char_portion)
189 g_string_free (context->leftover_char_portion, TRUE);
/* Puts the context into STATE_ERROR and, if the parser supplied an error
 * callback, invokes it with the error and the user data. */
195 mark_error (GMarkupParseContext *context,
198 context->state = STATE_ERROR;
200 if (context->parser->error)
201 (*context->parser->error) (context, error, context->user_data);
/* Forward declaration so the printf-style format attribute can be attached:
 * argument 4 is the format string, arguments from 5 on are its values. */
204 static void set_error (GMarkupParseContext *context,
208 ...) G_GNUC_PRINTF (4, 5);
/* Builds a G_MARKUP_ERROR whose message is the formatted text prefixed with
 * the context's current line and character number, marks the context as
 * failed via mark_error(), and propagates the error to the caller. */
211 set_error (GMarkupParseContext *context,
221 va_start (args, format);
222 s = g_strdup_vprintf (format, args);
225 tmp_error = g_error_new (G_MARKUP_ERROR,
227 _("Error on line %d char %d: %s"),
228 context->line_number,
229 context->char_number,
/* Notify the parser's error callback before handing the error upward. */
234 mark_error (context, tmp_error);
236 g_propagate_error (error, tmp_error);
240 /* To make these faster, we first use the ascii-only tests, then check
241 * for the usual non-alnum name-end chars, and only then call the
242 * expensive unicode stuff. Nobody uses non-ascii in XML tag/attribute
243 * names, so this is a reasonable hack that virtually always avoids the expensive unicode calls.
/* True for the ASCII characters '=', '/', '>' and ' '.  The name tests
 * below check this first so the unicode lookup is skipped for them.
 * The argument is evaluated up to four times. */
246 #define IS_COMMON_NAME_END_CHAR(c) \
247 ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')
/* Tests whether the UTF-8 character at p may begin an element, attribute or
 * entity name.  ASCII letters are accepted directly; otherwise the character
 * must not be a common name-end character and must pass the remaining
 * tests, the last of which is g_unichar_isalpha().
 * NOTE(review): two condition lines and the return statements are not
 * visible in this listing - confirm the full set of accepted characters. */
250 is_name_start_char (const gchar *p)
252 if (g_ascii_isalpha (*p) ||
253 (!IS_COMMON_NAME_END_CHAR (*p) &&
256 g_unichar_isalpha (g_utf8_get_char (p)))))
/* Tests whether the UTF-8 character at p may appear inside a name.  ASCII
 * letters and digits are accepted directly; otherwise the character must
 * not be a common name-end character and must pass the remaining tests, the
 * last of which is g_unichar_isalpha().
 * NOTE(review): several condition lines and the return statements are not
 * visible in this listing - confirm the full set of accepted characters. */
263 is_name_char (const gchar *p)
265 if (g_ascii_isalnum (*p) ||
266 (!IS_COMMON_NAME_END_CHAR (*p) &&
271 g_unichar_isalpha (g_utf8_get_char (p)))))
/* Encodes the code point c as UTF-8 into the caller-provided buffer buf.
 * NOTE(review): buffer preparation and the return value are not visible in
 * this listing; callers use the result as a string, so buf is presumably
 * NUL-terminated and returned - confirm the required buffer size. */
279 char_str (gunichar c,
283 g_unichar_to_utf8 (c, buf);
/* Decodes the first character of the UTF-8 string utf8 and re-encodes just
 * that character into buf through char_str(); used to quote the offending
 * character in error messages. */
288 utf8_str (const gchar *utf8,
291 char_str (g_utf8_get_char (utf8), buf);
/* Error reporter for the unescaping pass.  The reported line is the
 * context's current line minus a count accumulated while walking from
 * remaining_text to remaining_text_end.  The context is put into
 * STATE_ERROR through mark_error() and the error is propagated.
 * NOTE(review): the loop body that decides when remaining_newlines is
 * incremented is not visible in this listing - presumably once per '\n'. */
296 set_unescape_error (GMarkupParseContext *context,
298 const gchar *remaining_text,
299 const gchar *remaining_text_end,
307 gint remaining_newlines;
310 remaining_newlines = 0;
312 while (p != remaining_text_end)
315 ++remaining_newlines;
319 va_start (args, format);
320 s = g_strdup_vprintf (format, args);
323 tmp_error = g_error_new (G_MARKUP_ERROR,
325 _("Error on line %d: %s"),
326 context->line_number - remaining_newlines,
331 mark_error (context, tmp_error);
333 g_propagate_error (error, tmp_error);
/* States of the unescaping sub-machine driven by unescape_text().
 * NOTE(review): USTATE_INSIDE_TEXT is used below but its enumerator line is
 * not visible in this listing. */
339 USTATE_AFTER_AMPERSAND,
340 USTATE_INSIDE_ENTITY_NAME,
341 USTATE_AFTER_CHARREF_HASH
/* Working state for one unescape_text() run.
 * NOTE(review): the str, state and text members assigned in unescape_text()
 * are not visible in this listing. */
/* Owning parse context; used for error reporting and state checks. */
346 GMarkupParseContext *context;
/* One past the last byte of the text being unescaped. */
350 const gchar *text_end;
/* Start of the entity name or character-reference digits being read. */
351 const gchar *entity_start;
/* Copies literal text into ucontext->str up to the next '&' or the end of
 * the input, normalizing whitespace on the way.  When the surrounding parse
 * state is an attribute value, tabs and newlines are replaced by a space.
 * On reaching '&' the function steps past it and switches to
 * USTATE_AFTER_AMPERSAND.
 * NOTE(review): the loop's first branch, the condition of its third branch
 * and the return statement are not visible in this listing. */
355 unescape_text_state_inside_text (UnescapeContext *ucontext,
360 gboolean normalize_attribute;
362 if (ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
363 ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
364 normalize_attribute = TRUE;
366 normalize_attribute = FALSE;
370 while (p != ucontext->text_end)
376 else if (normalize_attribute && (*p == '\t' || *p == '\n'))
/* Flush the pending run, then emit a single space for the tab/newline. */
378 g_string_append_len (ucontext->str, start, p - start);
379 g_string_append_c (ucontext->str, ' ');
380 p = g_utf8_next_char (p);
/* Hidden condition; presumably a '\r': it becomes a space (attributes) or
 * '\n' (text), and a directly following '\n' is swallowed - TODO confirm. */
385 g_string_append_len (ucontext->str, start, p - start);
386 g_string_append_c (ucontext->str, normalize_attribute ? ' ' : '\n');
387 p = g_utf8_next_char (p);
388 if (p != ucontext->text_end && *p == '\n')
389 p = g_utf8_next_char (p);
393 p = g_utf8_next_char (p);
/* Flush whatever literal text is still pending. */
397 g_string_append_len (ucontext->str, start, p - start);
399 if (p != ucontext->text_end && *p == '&')
401 p = g_utf8_next_char (p);
402 ucontext->state = USTATE_AFTER_AMPERSAND;
409 unescape_text_state_after_ampersand (UnescapeContext *ucontext,
413 ucontext->entity_start = NULL;
417 p = g_utf8_next_char (p);
419 ucontext->entity_start = p;
420 ucontext->state = USTATE_AFTER_CHARREF_HASH;
422 else if (!is_name_start_char (p))
426 set_unescape_error (ucontext->context, error,
427 p, ucontext->text_end,
428 G_MARKUP_ERROR_PARSE,
429 _("Empty entity '&;' seen; valid "
430 "entities are: & " < > '"));
436 set_unescape_error (ucontext->context, error,
437 p, ucontext->text_end,
438 G_MARKUP_ERROR_PARSE,
439 _("Character '%s' is not valid at "
440 "the start of an entity name; "
441 "the & character begins an entity; "
442 "if this ampersand isn't supposed "
443 "to be an entity, escape it as "
450 ucontext->entity_start = p;
451 ucontext->state = USTATE_INSIDE_ENTITY_NAME;
458 unescape_text_state_inside_entity_name (UnescapeContext *ucontext,
462 while (p != ucontext->text_end)
466 else if (!is_name_char (p))
470 set_unescape_error (ucontext->context, error,
471 p, ucontext->text_end,
472 G_MARKUP_ERROR_PARSE,
473 _("Character '%s' is not valid "
474 "inside an entity name"),
479 p = g_utf8_next_char (p);
482 if (ucontext->context->state != STATE_ERROR)
484 if (p != ucontext->text_end)
486 gint len = p - ucontext->entity_start;
488 /* move to after semicolon */
489 p = g_utf8_next_char (p);
490 ucontext->state = USTATE_INSIDE_TEXT;
492 if (strncmp (ucontext->entity_start, "lt", len) == 0)
493 g_string_append_c (ucontext->str, '<');
494 else if (strncmp (ucontext->entity_start, "gt", len) == 0)
495 g_string_append_c (ucontext->str, '>');
496 else if (strncmp (ucontext->entity_start, "amp", len) == 0)
497 g_string_append_c (ucontext->str, '&');
498 else if (strncmp (ucontext->entity_start, "quot", len) == 0)
499 g_string_append_c (ucontext->str, '"');
500 else if (strncmp (ucontext->entity_start, "apos", len) == 0)
501 g_string_append_c (ucontext->str, '\'');
506 name = g_strndup (ucontext->entity_start, len);
507 set_unescape_error (ucontext->context, error,
508 p, ucontext->text_end,
509 G_MARKUP_ERROR_PARSE,
510 _("Entity name '%s' is not known"),
517 set_unescape_error (ucontext->context, error,
518 /* give line number of the & */
519 ucontext->entity_start, ucontext->text_end,
520 G_MARKUP_ERROR_PARSE,
521 _("Entity did not end with a semicolon; "
522 "most likely you used an ampersand "
523 "character without intending to start "
524 "an entity - escape ampersand as &"));
533 unescape_text_state_after_charref_hash (UnescapeContext *ucontext,
537 gboolean is_hex = FALSE;
540 start = ucontext->entity_start;
545 p = g_utf8_next_char (p);
549 while (p != ucontext->text_end && *p != ';')
550 p = g_utf8_next_char (p);
552 if (p != ucontext->text_end)
554 g_assert (*p == ';');
556 /* digit is between start and p */
565 l = strtoul (start, &end, 16);
567 l = strtoul (start, &end, 10);
569 if (end != p || errno != 0)
571 set_unescape_error (ucontext->context, error,
572 start, ucontext->text_end,
573 G_MARKUP_ERROR_PARSE,
574 _("Failed to parse '%-.*s', which "
575 "should have been a digit "
576 "inside a character reference "
577 "(ê for example) - perhaps "
578 "the digit is too large"),
583 /* characters XML permits */
587 (l >= 0x20 && l <= 0xD7FF) ||
588 (l >= 0xE000 && l <= 0xFFFD) ||
589 (l >= 0x10000 && l <= 0x10FFFF))
592 g_string_append (ucontext->str, char_str (l, buf));
596 set_unescape_error (ucontext->context, error,
597 start, ucontext->text_end,
598 G_MARKUP_ERROR_PARSE,
599 _("Character reference '%-.*s' does not "
600 "encode a permitted character"),
605 /* Move to next state */
606 p = g_utf8_next_char (p); /* past semicolon */
607 ucontext->state = USTATE_INSIDE_TEXT;
611 set_unescape_error (ucontext->context, error,
612 start, ucontext->text_end,
613 G_MARKUP_ERROR_PARSE,
614 _("Empty character reference; "
615 "should include a digit such as "
621 set_unescape_error (ucontext->context, error,
622 start, ucontext->text_end,
623 G_MARKUP_ERROR_PARSE,
624 _("Character reference did not end with a "
626 "most likely you used an ampersand "
627 "character without intending to start "
628 "an entity - escape ampersand as &"));
/* Expands entity and character references in the range ending at text_end
 * and hands the result back through *unescaped.  The work is done by a
 * small state machine: each iteration calls the handler for the current
 * unescape state, which returns the position to continue from.  The loop
 * stops at the end of the input or as soon as a handler has put the parse
 * context into STATE_ERROR.  On error the partially built string is freed.
 * NOTE(review): the return statements are not visible in this listing;
 * callers test the result as a success flag. */
635 unescape_text (GMarkupParseContext *context,
637 const gchar *text_end,
641 UnescapeContext ucontext;
644 ucontext.context = context;
645 ucontext.text = text;
646 ucontext.text_end = text_end;
647 ucontext.entity_start = NULL;
/* The output is pre-sized to the input length. */
649 ucontext.str = g_string_sized_new (text_end - text);
651 ucontext.state = USTATE_INSIDE_TEXT;
654 while (p != text_end && context->state != STATE_ERROR)
656 g_assert (p < text_end);
658 switch (ucontext.state)
660 case USTATE_INSIDE_TEXT:
662 p = unescape_text_state_inside_text (&ucontext,
668 case USTATE_AFTER_AMPERSAND:
670 p = unescape_text_state_after_ampersand (&ucontext,
677 case USTATE_INSIDE_ENTITY_NAME:
679 p = unescape_text_state_inside_entity_name (&ucontext,
685 case USTATE_AFTER_CHARREF_HASH:
687 p = unescape_text_state_after_charref_hash (&ucontext,
694 g_assert_not_reached ();
/* Input exhausted without error: any state other than plain text means a
 * reference was started but never finished. */
699 if (context->state != STATE_ERROR)
701 switch (ucontext.state)
703 case USTATE_INSIDE_TEXT:
705 case USTATE_AFTER_AMPERSAND:
706 case USTATE_INSIDE_ENTITY_NAME:
707 set_unescape_error (context, error,
709 G_MARKUP_ERROR_PARSE,
710 _("Unfinished entity reference"));
712 case USTATE_AFTER_CHARREF_HASH:
713 set_unescape_error (context, error,
715 G_MARKUP_ERROR_PARSE,
716 _("Unfinished character reference"));
721 if (context->state == STATE_ERROR)
723 g_string_free (ucontext.str, TRUE);
/* Success: ownership of the string passes to the caller. */
729 *unescaped = ucontext.str;
/* Moves context->iter forward by one UTF-8 character and keeps the
 * line/character counters in step: char_number is incremented, and when the
 * new character is a newline, line_number is incremented and char_number
 * restarts at 1.
 * NOTE(review): the return statements are not visible in this listing; the
 * "while (advance_char (context))" callers suggest it yields FALSE once
 * iter reaches current_text_end - confirm. */
734 static inline gboolean
735 advance_char (GMarkupParseContext *context)
737 context->iter = g_utf8_next_char (context->iter);
738 context->char_number += 1;
/* Checked before the dereference below so the end is never read. */
740 if (context->iter == context->current_text_end)
744 else if (*context->iter == '\n')
746 context->line_number += 1;
747 context->char_number = 1;
/* TRUE when c is a space, tab, newline or carriage return. */
753 static inline gboolean
756 return c == ' ' || c == '\t' || c == '\n' || c == '\r';
/* Advances context->iter while it points at a character for which
 * xml_isspace() is true, stopping when advance_char() reports that it
 * cannot continue.
 * NOTE(review): the statement taken when a non-space is found is not
 * visible in this listing - presumably an early return. */
760 skip_spaces (GMarkupParseContext *context)
764 if (!xml_isspace (*context->iter))
767 while (advance_char (context));
/* Advances context->iter while it points at a character accepted by
 * is_name_char(), stopping when advance_char() reports that it cannot
 * continue.
 * NOTE(review): the statement taken when a non-name character is found is
 * not visible in this listing - presumably an early return. */
771 advance_to_name_end (GMarkupParseContext *context)
775 if (!is_name_char (context->iter))
778 while (advance_char (context));
/* Appends the bytes in [text_start, text_end) to context->partial_chunk,
 * creating the GString first if it does not exist yet.  An empty range
 * still guarantees that partial_chunk is non-NULL afterwards. */
782 add_to_partial (GMarkupParseContext *context,
783 const gchar *text_start,
784 const gchar *text_end)
786 if (context->partial_chunk == NULL)
787 context->partial_chunk = g_string_sized_new (text_end - text_start);
789 if (text_start != text_end)
790 g_string_append_len (context->partial_chunk, text_start,
791 text_end - text_start);
793 /* Invariant here that partial_chunk exists */
/* Empties context->partial_chunk, if there is one, by truncating it to
 * length 0; the GString itself is kept. */
797 truncate_partial (GMarkupParseContext *context)
799 if (context->partial_chunk != NULL)
801 context->partial_chunk = g_string_truncate (context->partial_chunk, 0);
/* Returns the data of the first tag_stack node, i.e. the name most recently
 * pushed.  Dereferences context->tag_stack without a NULL check, so it must
 * only be called while the stack is non-empty. */
806 current_element (GMarkupParseContext *context)
808 return context->tag_stack->data;
/* Returns the name stored at index cur_attr of attr_names; asserts that an
 * attribute has been recorded (cur_attr >= 0). */
812 current_attribute (GMarkupParseContext *context)
814 g_assert (context->cur_attr >= 0);
815 return context->attr_names[context->cur_attr];
/* Determines where the usable part of the current chunk ends.  If the chunk
 * finishes with a complete UTF-8 character, current_text_end is simply the
 * end of the buffer.  Otherwise the trailing partial character is copied
 * into leftover_char_portion, excluded from current_text_len, and
 * current_text_end is set to where that partial character starts.
 * Requires current_text_len > 0 (asserted).
 * NOTE(review): the condition choosing between the two cases is not visible
 * in this listing - presumably a comparison of next against end. */
819 find_current_text_end (GMarkupParseContext *context)
821 /* This function must be safe (non-segfaulting) on invalid UTF8.
822 * It assumes the string starts with a character start
824 const gchar *end = context->current_text + context->current_text_len;
828 g_assert (context->current_text_len > 0);
830 p = g_utf8_find_prev_char (context->current_text, end);
832 g_assert (p != NULL); /* since current_text was a char start */
834 /* p is now the start of the last character or character portion. */
836 next = g_utf8_next_char (p); /* this only touches *p, nothing beyond */
840 /* whole character */
841 context->current_text_end = end;
/* Partial trailing character: stash it for the next parse call. */
846 context->leftover_char_portion = g_string_new_len (p, end - p);
847 context->current_text_len -= (end - p);
848 context->current_text_end = p;
/* Records a new attribute name.  The attr_names and attr_values arrays are
 * grown by five slots whenever cur_attr + 2 would reach alloc_attrs.  The
 * name is stored at index cur_attr with a NULL value, and the slot after it
 * is set to NULL in both arrays so they stay NULL-terminated.  The name
 * pointer is stored as is, not copied.
 * NOTE(review): the statement that advances cur_attr before the stores is
 * not visible in this listing - confirm it precedes them. */
854 add_attribute (GMarkupParseContext *context, char *name)
856 if (context->cur_attr + 2 >= context->alloc_attrs)
858 context->alloc_attrs += 5; /* silly magic number */
859 context->attr_names = g_realloc (context->attr_names, sizeof(char*)*context->alloc_attrs);
860 context->attr_values = g_realloc (context->attr_values, sizeof(char*)*context->alloc_attrs);
863 context->attr_names[context->cur_attr] = name;
864 context->attr_values[context->cur_attr] = NULL;
865 context->attr_names[context->cur_attr+1] = NULL;
866 context->attr_values[context->cur_attr+1] = NULL;
870 * g_markup_parse_context_parse:
871 * @context: a #GMarkupParseContext
872 * @text: chunk of text to parse
873 * @text_len: length of @text in bytes
874 * @error: return location for a #GError
876 * Feed some data to the #GMarkupParseContext. The data need not
877 * be valid UTF-8; an error will be signaled if it's invalid.
878 * The data need not be an entire document; you can feed a document
879 * into the parser incrementally, via multiple calls to this function.
880 * Typically, as you receive data from a network connection or file,
881 * you feed each received chunk of data into this function, aborting
882 * the process if an error occurs. Once an error is reported, no further
883 * data may be fed to the #GMarkupParseContext; all errors are fatal.
885 * Return value: %FALSE if an error occurred, %TRUE on success
/* Feeds one chunk of text through the parser state machine.
 *
 * Rejects (returning FALSE via g_return_val_if_fail) a NULL context or
 * text, a context already in STATE_ERROR, and re-entrant use while
 * context->parsing is set.  Bytes of an incomplete UTF-8 character left
 * over from the previous call are completed first; the chunk is then
 * validated as UTF-8 and consumed by the switch on context->state below.
 * Names, attribute values, text and passthroughs that straddle chunk
 * boundaries are accumulated in context->partial_chunk.
 *
 * The result is TRUE unless the context ended up in STATE_ERROR.
 * NOTE(review): many short lines (braces, returns, some conditions and
 * declarations) are not visible in this listing; the comments added here
 * describe only what the visible lines show. */
888 g_markup_parse_context_parse (GMarkupParseContext *context,
893 const gchar *first_invalid;
895 g_return_val_if_fail (context != NULL, FALSE);
896 g_return_val_if_fail (text != NULL, FALSE);
897 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
898 g_return_val_if_fail (!context->parsing, FALSE);
/* Length taken from the NUL terminator; the guarding condition is not
 * visible (presumably a negative text_len) - TODO confirm. */
901 text_len = strlen (text);
906 context->parsing = TRUE;
/* Finish a UTF-8 character that was split across the previous chunk. */
908 if (context->leftover_char_portion)
910 const gchar *first_char;
/* A byte that is not of the form 10xxxxxx starts a new character. */
912 if ((*text & 0xc0) != 0x80)
915 first_char = g_utf8_find_next_char (text, text + text_len);
919 /* leftover_char_portion was completed. Parse it. */
920 GString *portion = context->leftover_char_portion;
922 g_string_append_len (context->leftover_char_portion,
923 text, first_char - text);
925 /* hacks to allow recursion */
926 context->parsing = FALSE;
927 context->leftover_char_portion = NULL;
929 if (!g_markup_parse_context_parse (context,
930 portion->str, portion->len,
933 g_assert (context->state == STATE_ERROR);
936 g_string_free (portion, TRUE);
937 context->parsing = TRUE;
939 /* Skip the fraction of char that was in this text */
940 text_len -= (first_char - text);
945 /* another little chunk of the leftover char; geez
946 * someone is inefficient.
948 g_string_append_len (context->leftover_char_portion,
951 if (context->leftover_char_portion->len > 7)
953 /* The leftover char portion is too big to be
958 G_MARKUP_ERROR_BAD_UTF8,
959 _("Invalid UTF-8 encoded text"));
966 context->current_text = text;
967 context->current_text_len = text_len;
968 context->iter = context->current_text;
969 context->start = context->iter;
971 /* Nothing left after finishing the leftover char, or nothing
972 * passed in to begin with.
974 if (context->current_text_len == 0)
977 /* find_current_text_end () assumes the string starts at
978 * a character start, so we need to validate at least
979 * that much. It doesn't assume any following bytes
982 if ((*context->current_text & 0xc0) == 0x80) /* not a char start */
986 G_MARKUP_ERROR_BAD_UTF8,
987 _("Invalid UTF-8 encoded text"));
991 /* Initialize context->current_text_end, possibly adjusting
992 * current_text_len, and add any leftover char portion
994 find_current_text_end (context);
996 /* Validate UTF8 (must be done after we find the end, since
997 * we could have a trailing incomplete char)
999 if (!g_utf8_validate (context->current_text,
1000 context->current_text_len,
/* Validation failed: the line counter is adjusted by a newline count
 * before the error is reported (the counting loop body is not visible). */
1005 p = context->current_text;
1006 while (p != context->current_text_end)
1013 context->line_number += newlines;
1017 G_MARKUP_ERROR_BAD_UTF8,
1018 _("Invalid UTF-8 encoded text"));
/* Main loop: one state-machine step per iteration until the chunk is
 * consumed. */
1022 while (context->iter != context->current_text_end)
1024 switch (context->state)
1027 /* Possible next state: AFTER_OPEN_ANGLE */
1029 g_assert (context->tag_stack == NULL);
1031 /* whitespace is ignored outside of any elements */
1032 skip_spaces (context);
1034 if (context->iter != context->current_text_end)
1036 if (*context->iter == '<')
1038 /* Move after the open angle */
1039 advance_char (context);
1041 context->state = STATE_AFTER_OPEN_ANGLE;
1043 /* this could start a passthrough */
1044 context->start = context->iter;
1046 /* document is now non-empty */
1047 context->document_empty = FALSE;
1053 G_MARKUP_ERROR_PARSE,
1054 _("Document must begin with an element (e.g. <book>)"));
1059 case STATE_AFTER_OPEN_ANGLE:
1060 /* Possible next states: INSIDE_OPEN_TAG_NAME,
1061 * AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH
1063 if (*context->iter == '?' ||
1064 *context->iter == '!')
1066 /* include < in the passthrough */
1067 const gchar *openangle = "<";
1068 add_to_partial (context, openangle, openangle + 1);
1069 context->start = context->iter;
1070 context->balance = 1;
1071 context->state = STATE_INSIDE_PASSTHROUGH;
1073 else if (*context->iter == '/')
1076 advance_char (context);
1078 context->state = STATE_AFTER_CLOSE_TAG_SLASH;
1080 else if (is_name_start_char (context->iter))
1082 context->state = STATE_INSIDE_OPEN_TAG_NAME;
1084 /* start of tag name */
1085 context->start = context->iter;
1093 G_MARKUP_ERROR_PARSE,
1094 _("'%s' is not a valid character following "
1095 "a '<' character; it may not begin an "
1097 utf8_str (context->iter, buf));
1101 /* The AFTER_CLOSE_ANGLE state is actually sort of
1102 * broken, because it doesn't correspond to a range
1103 * of characters in the input stream as the others do,
1104 * and thus makes things harder to conceptualize
1106 case STATE_AFTER_CLOSE_ANGLE:
1107 /* Possible next states: INSIDE_TEXT, STATE_START */
1108 if (context->tag_stack == NULL)
1110 context->start = NULL;
1111 context->state = STATE_START;
1115 context->start = context->iter;
1116 context->state = STATE_INSIDE_TEXT;
1120 case STATE_AFTER_ELISION_SLASH:
1121 /* Possible next state: AFTER_CLOSE_ANGLE */
1124 /* We need to pop the tag stack and call the end_element
1125 * function, since this is the close tag
1127 GError *tmp_error = NULL;
1129 g_assert (context->tag_stack != NULL);
1132 if (context->parser->end_element)
1133 (* context->parser->end_element) (context,
1134 context->tag_stack->data,
1140 mark_error (context, tmp_error);
1141 g_propagate_error (error, tmp_error);
1145 if (*context->iter == '>')
1147 /* move after the close angle */
1148 advance_char (context);
1149 context->state = STATE_AFTER_CLOSE_ANGLE;
1157 G_MARKUP_ERROR_PARSE,
1158 _("Odd character '%s', expected a '>' character "
1159 "to end the start tag of element '%s'"),
1160 utf8_str (context->iter, buf),
1161 current_element (context));
/* Pop the element: free its name, then unlink the stack node. */
1165 g_free (context->tag_stack->data);
1166 context->tag_stack = g_slist_delete_link (context->tag_stack,
1167 context->tag_stack);
1171 case STATE_INSIDE_OPEN_TAG_NAME:
1172 /* Possible next states: BETWEEN_ATTRIBUTES */
1174 /* if there's a partial chunk then it's the first part of the
1175 * tag name. If there's a context->start then it's the start
1176 * of the tag name in current_text, the partial chunk goes
1177 * before that start though.
1179 advance_to_name_end (context);
1181 if (context->iter == context->current_text_end)
1183 /* The name hasn't necessarily ended. Merge with
1184 * partial chunk, leave state unchanged.
1186 add_to_partial (context, context->start, context->iter);
1190 /* The name has ended. Combine it with the partial chunk
1191 * if any; push it on the stack; enter next state.
1193 add_to_partial (context, context->start, context->iter);
1194 context->tag_stack =
1195 g_slist_prepend (context->tag_stack,
1196 g_string_free (context->partial_chunk,
1199 context->partial_chunk = NULL;
1201 context->state = STATE_BETWEEN_ATTRIBUTES;
1202 context->start = NULL;
1206 case STATE_INSIDE_ATTRIBUTE_NAME:
1207 /* Possible next states: AFTER_ATTRIBUTE_NAME */
1209 advance_to_name_end (context);
1210 add_to_partial (context, context->start, context->iter);
1212 /* read the full name, if we enter the equals sign state
1213 * then add the attribute to the list (without the value),
1214 * otherwise store a partial chunk to be prepended later.
1216 if (context->iter != context->current_text_end)
1217 context->state = STATE_AFTER_ATTRIBUTE_NAME;
1220 case STATE_AFTER_ATTRIBUTE_NAME:
1221 /* Possible next states: AFTER_ATTRIBUTE_EQUALS_SIGN */
1223 skip_spaces (context);
1225 if (context->iter != context->current_text_end)
1227 /* The name has ended. Combine it with the partial chunk
1228 * if any; push it on the stack; enter next state.
/* add_attribute() stores the string pointer it is given. */
1230 add_attribute (context, g_string_free (context->partial_chunk, FALSE));
1232 context->partial_chunk = NULL;
1233 context->start = NULL;
1235 if (*context->iter == '=')
1237 advance_char (context);
1238 context->state = STATE_AFTER_ATTRIBUTE_EQUALS_SIGN;
1246 G_MARKUP_ERROR_PARSE,
1247 _("Odd character '%s', expected a '=' after "
1248 "attribute name '%s' of element '%s'"),
1249 utf8_str (context->iter, buf),
1250 current_attribute (context),
1251 current_element (context));
1257 case STATE_BETWEEN_ATTRIBUTES:
1258 /* Possible next states: AFTER_CLOSE_ANGLE,
1259 * AFTER_ELISION_SLASH, INSIDE_ATTRIBUTE_NAME
1261 skip_spaces (context);
1263 if (context->iter != context->current_text_end)
1265 if (*context->iter == '/')
1267 advance_char (context);
1268 context->state = STATE_AFTER_ELISION_SLASH;
1270 else if (*context->iter == '>')
1273 advance_char (context);
1274 context->state = STATE_AFTER_CLOSE_ANGLE;
1276 else if (is_name_start_char (context->iter))
1278 context->state = STATE_INSIDE_ATTRIBUTE_NAME;
1279 /* start of attribute name */
1280 context->start = context->iter;
1288 G_MARKUP_ERROR_PARSE,
1289 _("Odd character '%s', expected a '>' or '/' "
1290 "character to end the start tag of "
1291 "element '%s', or optionally an attribute; "
1292 "perhaps you used an invalid character in "
1293 "an attribute name"),
1294 utf8_str (context->iter, buf),
1295 current_element (context));
1298 /* If we're done with attributes, invoke
1299 * the start_element callback
1301 if (context->state == STATE_AFTER_ELISION_SLASH ||
1302 context->state == STATE_AFTER_CLOSE_ANGLE)
1304 const gchar *start_name;
1305 /* Ugly, but the current code expects an empty array instead of NULL */
/* NOTE(review): the "∅" in the two initializers below is not valid C; it
 * looks like a decoded "&empty;" entity, i.e. the address of the local
 * `empty` - confirm against the pristine source. */
1306 const gchar *empty = NULL;
1307 const gchar **attr_names = ∅
1308 const gchar **attr_values = ∅
1311 /* Call user callback for element start */
1312 start_name = current_element (context);
1314 if (context->cur_attr >= 0)
1316 attr_names = (const gchar**)context->attr_names;
1317 attr_values = (const gchar**)context->attr_values;
1321 if (context->parser->start_element)
1322 (* context->parser->start_element) (context,
1324 (const gchar **)attr_names,
1325 (const gchar **)attr_values,
1329 /* Go ahead and free the attributes. */
1330 for (; context->cur_attr >= 0; context->cur_attr--)
1332 int pos = context->cur_attr;
1333 g_free (context->attr_names[pos]);
1334 g_free (context->attr_values[pos]);
1335 context->attr_names[pos] = context->attr_values[pos] = NULL;
1337 g_assert (context->cur_attr == -1);
1338 g_assert (context->attr_names == NULL ||
1339 context->attr_names[0] == NULL);
1340 g_assert (context->attr_values == NULL ||
1341 context->attr_values[0] == NULL);
1343 if (tmp_error != NULL)
1345 mark_error (context, tmp_error);
1346 g_propagate_error (error, tmp_error);
1352 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1353 /* Possible next state: INSIDE_ATTRIBUTE_VALUE_[SQ/DQ] */
1355 skip_spaces (context);
1357 if (context->iter != context->current_text_end)
1359 if (*context->iter == '"')
1361 advance_char (context);
1362 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_DQ;
1363 context->start = context->iter;
1365 else if (*context->iter == '\'')
1367 advance_char (context);
1368 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_SQ;
1369 context->start = context->iter;
1377 G_MARKUP_ERROR_PARSE,
1378 _("Odd character '%s', expected an open quote mark "
1379 "after the equals sign when giving value for "
1380 "attribute '%s' of element '%s'"),
1381 utf8_str (context->iter, buf),
1382 current_attribute (context),
1383 current_element (context));
1388 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1389 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1390 /* Possible next states: BETWEEN_ATTRIBUTES */
1394 if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ)
/* Scan forward to the quote character that opened the value. */
1405 if (*context->iter == delim)
1408 while (advance_char (context));
1410 if (context->iter == context->current_text_end)
1412 /* The value hasn't necessarily ended. Merge with
1413 * partial chunk, leave state unchanged.
1415 add_to_partial (context, context->start, context->iter);
1419 /* The value has ended at the quote mark. Combine it
1420 * with the partial chunk if any; set it for the current
1425 add_to_partial (context, context->start, context->iter);
1427 g_assert (context->cur_attr >= 0);
1429 if (unescape_text (context,
1430 context->partial_chunk->str,
1431 context->partial_chunk->str +
1432 context->partial_chunk->len,
1436 /* success, advance past quote and set state. */
1437 context->attr_values[context->cur_attr] = g_string_free (unescaped, FALSE);
1438 advance_char (context);
1439 context->state = STATE_BETWEEN_ATTRIBUTES;
1440 context->start = NULL;
1443 truncate_partial (context);
1447 case STATE_INSIDE_TEXT:
1448 /* Possible next states: AFTER_OPEN_ANGLE */
1451 if (*context->iter == '<')
1454 while (advance_char (context));
1456 /* The text hasn't necessarily ended. Merge with
1457 * partial chunk, leave state unchanged.
1460 add_to_partial (context, context->start, context->iter);
1462 if (context->iter != context->current_text_end)
1464 GString *unescaped = NULL;
1466 /* The text has ended at the open angle. Call the text
1470 if (unescape_text (context,
1471 context->partial_chunk->str,
1472 context->partial_chunk->str +
1473 context->partial_chunk->len,
1477 GError *tmp_error = NULL;
1479 if (context->parser->text)
1480 (*context->parser->text) (context,
1486 g_string_free (unescaped, TRUE);
1488 if (tmp_error == NULL)
1490 /* advance past open angle and set state. */
1491 advance_char (context);
1492 context->state = STATE_AFTER_OPEN_ANGLE;
1493 /* could begin a passthrough */
1494 context->start = context->iter;
1498 mark_error (context, tmp_error);
1499 g_propagate_error (error, tmp_error);
1503 truncate_partial (context);
1507 case STATE_AFTER_CLOSE_TAG_SLASH:
1508 /* Possible next state: INSIDE_CLOSE_TAG_NAME */
1509 if (is_name_start_char (context->iter))
1511 context->state = STATE_INSIDE_CLOSE_TAG_NAME;
1513 /* start of tag name */
1514 context->start = context->iter;
1522 G_MARKUP_ERROR_PARSE,
1523 _("'%s' is not a valid character following "
1524 "the characters '</'; '%s' may not begin an "
1526 utf8_str (context->iter, buf),
1527 utf8_str (context->iter, buf));
1531 case STATE_INSIDE_CLOSE_TAG_NAME:
1532 /* Possible next state: AFTER_CLOSE_TAG_NAME */
1533 advance_to_name_end (context);
1534 add_to_partial (context, context->start, context->iter);
1536 if (context->iter != context->current_text_end)
1537 context->state = STATE_AFTER_CLOSE_TAG_NAME;
1540 case STATE_AFTER_CLOSE_TAG_NAME:
1541 /* Possible next state: AFTER_CLOSE_TAG_SLASH */
1543 skip_spaces (context);
1545 if (context->iter != context->current_text_end)
1549 /* The name has ended. Combine it with the partial chunk
1550 * if any; check that it matches stack top and pop
1551 * stack; invoke proper callback; enter next state.
1553 close_name = g_string_free (context->partial_chunk, FALSE);
1554 context->partial_chunk = NULL;
1556 if (*context->iter != '>')
1562 G_MARKUP_ERROR_PARSE,
1563 _("'%s' is not a valid character following "
1564 "the close element name '%s'; the allowed "
1565 "character is '>'"),
1566 utf8_str (context->iter, buf),
1569 else if (context->tag_stack == NULL)
1573 G_MARKUP_ERROR_PARSE,
1574 _("Element '%s' was closed, no element "
1575 "is currently open"),
1578 else if (strcmp (close_name, current_element (context)) != 0)
1582 G_MARKUP_ERROR_PARSE,
1583 _("Element '%s' was closed, but the currently "
1584 "open element is '%s'"),
1586 current_element (context));
1591 advance_char (context);
1592 context->state = STATE_AFTER_CLOSE_ANGLE;
1593 context->start = NULL;
1595 /* call the end_element callback */
1597 if (context->parser->end_element)
1598 (* context->parser->end_element) (context,
1604 /* Pop the tag stack */
1605 g_free (context->tag_stack->data);
1606 context->tag_stack = g_slist_delete_link (context->tag_stack,
1607 context->tag_stack);
1611 mark_error (context, tmp_error);
1612 g_propagate_error (error, tmp_error);
1616 g_free (close_name);
1620 case STATE_INSIDE_PASSTHROUGH:
1621 /* Possible next state: AFTER_CLOSE_ANGLE */
/* Nested angle brackets are tracked (statements not visible; presumably
 * via context->balance) so the passthrough ends at the right '>'. */
1624 if (*context->iter == '<')
1626 if (*context->iter == '>')
1632 add_to_partial (context, context->start, context->iter);
1633 context->start = context->iter;
1635 str = context->partial_chunk->str;
1636 len = context->partial_chunk->len;
/* Each test pairs an opener with the terminator it requires. */
1638 if (str[1] == '?' && str[len - 1] == '?')
1640 if (strncmp (str, "<!--", 4) == 0 &&
1641 strcmp (str + len - 2, "--") == 0)
1643 if (strncmp (str, "<![CDATA[", 9) == 0 &&
1644 strcmp (str + len - 2, "]]") == 0)
1646 if (strncmp (str, "<!DOCTYPE", 9) == 0 &&
1647 context->balance == 0)
1651 while (advance_char (context));
1653 if (context->iter == context->current_text_end)
1655 /* The passthrough hasn't necessarily ended. Merge with
1656 * partial chunk, leave state unchanged.
1658 add_to_partial (context, context->start, context->iter);
1662 /* The passthrough has ended at the close angle. Combine
1663 * it with the partial chunk if any. Call the passthrough
1664 * callback. Note that the open/close angles are
1665 * included in the text of the passthrough.
1667 GError *tmp_error = NULL;
1669 advance_char (context); /* advance past close angle */
1670 add_to_partial (context, context->start, context->iter);
/* With G_MARKUP_TREAT_CDATA_AS_TEXT, the text callback gets the content
 * without the 9-byte "<![CDATA[" prefix and the 3-byte "]]>" suffix
 * (hence len - 12). */
1672 if (context->flags & G_MARKUP_TREAT_CDATA_AS_TEXT &&
1673 strncmp (context->partial_chunk->str, "<![CDATA[", 9) == 0)
1675 if (context->parser->text)
1676 (*context->parser->text) (context,
1677 context->partial_chunk->str + 9,
1678 context->partial_chunk->len - 12,
1682 else if (context->parser->passthrough)
1683 (*context->parser->passthrough) (context,
1684 context->partial_chunk->str,
1685 context->partial_chunk->len,
1689 truncate_partial (context);
1691 if (tmp_error == NULL)
1693 context->state = STATE_AFTER_CLOSE_ANGLE;
1694 context->start = context->iter; /* could begin text */
1698 mark_error (context, tmp_error);
1699 g_propagate_error (error, tmp_error);
1709 g_assert_not_reached ();
/* Parsing is over for this chunk; allow further calls and freeing. */
1715 context->parsing = FALSE;
1717 return context->state != STATE_ERROR;
1721 * g_markup_parse_context_end_parse:
1722 * @context: a #GMarkupParseContext
1723 * @error: return location for a #GError
1725 * Signals to the #GMarkupParseContext that all data has been
1726 * fed into the parse context with g_markup_parse_context_parse().
1727 * This function reports an error if the document isn't complete,
1728 * for example if elements are still open.
1730 * Return value: %TRUE on success, %FALSE if an error was set
1733 g_markup_parse_context_end_parse (GMarkupParseContext *context,
1736 g_return_val_if_fail (context != NULL, FALSE);
1737 g_return_val_if_fail (!context->parsing, FALSE);
1738 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
1740 if (context->partial_chunk != NULL)
1742 g_string_free (context->partial_chunk, TRUE);
1743 context->partial_chunk = NULL;
1746 if (context->document_empty)
1748 set_error (context, error, G_MARKUP_ERROR_EMPTY,
1749 _("Document was empty or contained only whitespace"));
1753 context->parsing = TRUE;
1755 switch (context->state)
1761 case STATE_AFTER_OPEN_ANGLE:
1762 set_error (context, error, G_MARKUP_ERROR_PARSE,
1763 _("Document ended unexpectedly just after an open angle bracket '<'"));
1766 case STATE_AFTER_CLOSE_ANGLE:
1767 if (context->tag_stack != NULL)
1769 /* Error message the same as for INSIDE_TEXT */
1770 set_error (context, error, G_MARKUP_ERROR_PARSE,
1771 _("Document ended unexpectedly with elements still open - "
1772 "'%s' was the last element opened"),
1773 current_element (context));
1777 case STATE_AFTER_ELISION_SLASH:
1778 set_error (context, error, G_MARKUP_ERROR_PARSE,
1779 _("Document ended unexpectedly, expected to see a close angle "
1780 "bracket ending the tag <%s/>"), current_element (context));
1783 case STATE_INSIDE_OPEN_TAG_NAME:
1784 set_error (context, error, G_MARKUP_ERROR_PARSE,
1785 _("Document ended unexpectedly inside an element name"));
1788 case STATE_INSIDE_ATTRIBUTE_NAME:
1789 set_error (context, error, G_MARKUP_ERROR_PARSE,
1790 _("Document ended unexpectedly inside an attribute name"));
1793 case STATE_BETWEEN_ATTRIBUTES:
1794 set_error (context, error, G_MARKUP_ERROR_PARSE,
1795 _("Document ended unexpectedly inside an element-opening "
1799 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1800 set_error (context, error, G_MARKUP_ERROR_PARSE,
1801 _("Document ended unexpectedly after the equals sign "
1802 "following an attribute name; no attribute value"));
1805 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1806 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1807 set_error (context, error, G_MARKUP_ERROR_PARSE,
1808 _("Document ended unexpectedly while inside an attribute "
1812 case STATE_INSIDE_TEXT:
1813 g_assert (context->tag_stack != NULL);
1814 set_error (context, error, G_MARKUP_ERROR_PARSE,
1815 _("Document ended unexpectedly with elements still open - "
1816 "'%s' was the last element opened"),
1817 current_element (context));
1820 case STATE_AFTER_CLOSE_TAG_SLASH:
1821 case STATE_INSIDE_CLOSE_TAG_NAME:
1822 set_error (context, error, G_MARKUP_ERROR_PARSE,
1823 _("Document ended unexpectedly inside the close tag for "
1824 "element '%s'"), current_element (context));
1827 case STATE_INSIDE_PASSTHROUGH:
1828 set_error (context, error, G_MARKUP_ERROR_PARSE,
1829 _("Document ended unexpectedly inside a comment or "
1830 "processing instruction"));
1835 g_assert_not_reached ();
1839 context->parsing = FALSE;
1841 return context->state != STATE_ERROR;
1845 * g_markup_parse_context_get_element:
1846 * @context: a #GMarkupParseContext
1847 * @returns: the name of the currently open element, or %NULL
1849 * Retrieves the name of the currently open element.
1853 G_CONST_RETURN gchar *
1854 g_markup_parse_context_get_element (GMarkupParseContext *context)
1856 g_return_val_if_fail (context != NULL, NULL);
1858 if (context->tag_stack == NULL)
1861 return current_element (context);
1865 * g_markup_parse_context_get_position:
1866 * @context: a #GMarkupParseContext
1867 * @line_number: return location for a line number, or %NULL
1868 * @char_number: return location for a char-on-line number, or %NULL
1870 * Retrieves the current line number and the number of the character on
1871 * that line. Intended for use in error messages; there are no strict
1872 * semantics for what constitutes the "current" line number other than
1873 * "the best number we could come up with for error messages."
1877 g_markup_parse_context_get_position (GMarkupParseContext *context,
1881 g_return_if_fail (context != NULL);
1884 *line_number = context->line_number;
1887 *char_number = context->char_number;
1891 append_escaped_text (GString *str,
1899 end = text + length;
1904 next = g_utf8_next_char (p);
1909 g_string_append (str, "&");
1913 g_string_append (str, "<");
1917 g_string_append (str, ">");
1921 g_string_append (str, "'");
1925 g_string_append (str, """);
1929 g_string_append_len (str, p, next - p);
1938 * g_markup_escape_text:
1939 * @text: some valid UTF-8 text
1940 * @length: length of @text in bytes, or -1 if the text is nul-terminated
1942 * Escapes text so that the markup parser will parse it verbatim.
1943 * Less than, greater than, ampersand, etc. are replaced with the
1944 * corresponding entities. This function would typically be used
1945 * when writing out a file to be parsed with the markup parser.
1947 * Note that this function doesn't protect whitespace and line endings
1948 * from being processed according to the XML rules for normalization
1949 * of line endings and attribute values.
1951 * Return value: a newly allocated string with the escaped text
1954 g_markup_escape_text (const gchar *text,
1959 g_return_val_if_fail (text != NULL, NULL);
1962 length = strlen (text);
1964 /* prealloc at least as long as original text */
1965 str = g_string_sized_new (length);
1966 append_escaped_text (str, text, length);
1968 return g_string_free (str, FALSE);
/* Returns the position just past an "N$" positional-argument prefix
 * located at @cp, or @cp itself if no such prefix is present.
 */
static const char *
skip_positional_arg (const char *cp)
{
  const char *np = cp;

  while (*np >= '0' && *np <= '9')
    np++;

  return (np != cp && *np == '$') ? np + 1 : cp;
}

/*
 * find_conversion:
 * @format: a printf-style format string
 * @after: location to store a pointer to the character after
 *   the returned conversion. On a %NULL return, returns the
 *   pointer to the trailing NUL in the string
 * 
 * Find the next conversion in a printf-style format string.
 * Partially based on code from printf-parser.c,
 * Copyright (C) 1999-2000, 2002-2003 Free Software Foundation, Inc.
 *
 * A conversion is scanned as: '%', optional "N$" positional argument,
 * flags, field width (digits or '*' with optional "N$"), optional
 * precision ('.' followed by digits or '*' with optional "N$"),
 * size modifiers, and finally one conversion character.  A lone '%'
 * at the very end of the string is not a conversion.
 * 
 * Return value: pointer to the next conversion in @format,
 *  or %NULL, if none.
 */
static const char *
find_conversion (const char  *format,
                 const char **after)
{
  const char *start = format;
  const char *cp;

  while (*start != '\0' && *start != '%')
    start++;

  if (*start == '\0')
    {
      *after = start;
      return NULL;
    }

  cp = start + 1;

  if (*cp == '\0')
    {
      *after = cp;
      return NULL;
    }

  /* Test for positional argument.  */
  cp = skip_positional_arg (cp);

  /* Skip the flags.  */
  while (*cp == '\'' ||
         *cp == '-' ||
         *cp == '+' ||
         *cp == ' ' ||
         *cp == '#' ||
         *cp == '0')
    cp++;

  /* Skip the field width.  */
  if (*cp == '*')
    {
      cp++;

      /* Test for positional argument.  */
      cp = skip_positional_arg (cp);
    }
  else
    {
      for (; *cp >= '0' && *cp <= '9'; cp++)
        ;
    }

  /* Skip the precision.  */
  if (*cp == '.')
    {
      cp++;
      if (*cp == '*')
        {
          /* Step over the '*' itself; without this the '*' would be
           * taken for the conversion character and the real one
           * (e.g. the 's' of "%.*s") left outside the conversion.
           */
          cp++;

          /* Test for positional argument.  */
          cp = skip_positional_arg (cp);
        }
      else
        {
          for (; *cp >= '0' && *cp <= '9'; cp++)
            ;
        }
    }

  /* Skip argument type/size specifiers.  */
  while (*cp == 'h' ||
         *cp == 'L' ||
         *cp == 'l' ||
         *cp == 'j' ||
         *cp == 'z' ||
         *cp == 'Z' ||
         *cp == 't')
    cp++;

  /* Skip the conversion character.  */
  cp++;

  *after = cp;
  return start;
}
2098 * g_markup_vprintf_escaped:
2099 * @format: printf() style format string
2100 * @args: variable argument list, similar to vprintf()
2102 * Formats the data in @args according to @format, escaping
2103 * all string and character arguments in the fashion
2104 * of g_markup_escape_text(). See g_markup_printf_escaped().
2106 * Return value: newly allocated result from formatting
2107 * operation. Free with g_free().
2112 g_markup_vprintf_escaped (const char *format,
2117 GString *result = NULL;
2118 gchar *output1 = NULL;
2119 gchar *output2 = NULL;
2120 const char *p, *op1, *op2;
2123 /* The technique here, is that we make two format strings that
2124 * have the identical conversions in the identical order to the
2125 * original strings, but differ in the text in-between. We
2126 * then use the normal g_strdup_vprintf() to format the arguments
2127 * with the two new format strings. By comparing the results,
2128 * we can figure out what segments of the output come from
2129 * the the original format string, and what from the arguments,
2130 * and thus know what portions of the string to escape.
2132 * For instance, for:
2134 * g_markup_printf_escaped ("%s ate %d apples", "Susan & Fred", 5);
2136 * We form the two format strings "%sX%dX" and %sY%sY". The results
2137 * of formatting with those two strings are
2139 * "%sX%dX" => "Susan & FredX5X"
2140 * "%sY%dY" => "Susan & FredY5Y"
2142 * To find the span of the first argument, we find the first position
2143 * where the two arguments differ, which tells us that the first
2144 * argument formatted to "Susan & Fred". We then escape that
2145 * to "Susan & Fred" and join up with the intermediate portions
2146 * of the format string and the second argument to get
2147 * "Susan & Fred ate 5 apples".
2150 /* Create the two modified format strings
2152 format1 = g_string_new (NULL);
2153 format2 = g_string_new (NULL);
2158 const char *conv = find_conversion (p, &after);
2162 g_string_append_len (format1, conv, after - conv);
2163 g_string_append_c (format1, 'X');
2164 g_string_append_len (format2, conv, after - conv);
2165 g_string_append_c (format2, 'Y');
2170 /* Use them to format the arguments
2172 G_VA_COPY (args2, args);
2174 output1 = g_strdup_vprintf (format1->str, args);
2179 output2 = g_strdup_vprintf (format2->str, args2);
2184 result = g_string_new (NULL);
2186 /* Iterate through the original format string again,
2187 * copying the non-conversion portions and the escaped
2188 * converted arguments to the output string.
2196 const char *output_start;
2197 const char *conv = find_conversion (p, &after);
2200 if (!conv) /* The end, after points to the trailing \0 */
2202 g_string_append_len (result, p, after - p);
2206 g_string_append_len (result, p, conv - p);
2208 while (*op1 == *op2)
2214 escaped = g_markup_escape_text (output_start, op1 - output_start);
2215 g_string_append (result, escaped);
2224 g_string_free (format1, TRUE);
2225 g_string_free (format2, TRUE);
2230 return g_string_free (result, FALSE);
/**
 * g_markup_printf_escaped:
 * @format: printf() style format string
 * @Varargs: the arguments to insert in the format string
 * 
 * Formats arguments according to @format, escaping
 * all string and character arguments in the fashion
 * of g_markup_escape_text(). This is useful when you
 * want to insert literal strings into XML-style markup
 * output, without having to worry that the strings
 * might themselves contain markup.
 *
 * <informalexample><programlisting>
 * const char *store = "Fortnum &amp; Mason";
 * const char *item = "Tea";
 * char *output;
 *
 * output = g_markup_printf_escaped ("&lt;purchase&gt;"
 *                                   "&lt;store&gt;&percnt;s&lt;/store&gt;"
 *                                   "&lt;item&gt;&percnt;s&lt;/item&gt;"
 *                                   "&lt;/purchase&gt;",
 *                                   store, item);
 * </programlisting></informalexample>
 * 
 * Return value: newly allocated result from formatting
 *  operation. Free with g_free().
 **/
char *
g_markup_printf_escaped (const char *format, ...)
{
  va_list ap;
  char *escaped;

  /* Thin variadic front end: all the work happens in
   * g_markup_vprintf_escaped().
   */
  va_start (ap, format);
  escaped = g_markup_vprintf_escaped (format, ap);
  va_end (ap);

  return escaped;
}
2277 #define __G_MARKUP_C__
2278 #include "galiasdef.c"