1 /* gmarkup.c - Simple XML-like parser
3 * Copyright 2000, 2003 Red Hat, Inc.
5 * GLib is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU Lesser General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * GLib is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with GLib; see the file COPYING.LIB. If not,
17 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
// Error-domain accessor: returns the quark that g_quark_from_static_string
// yields for the static string g-markup-error-quark.
35 g_markup_error_quark (void)
37 return g_quark_from_static_string ("g-markup-error-quark");
// Parser states stored in context->state and dispatched on by the main parse
// loop. NOTE(review): this listing omits some lines of the enum; STATE_START,
// STATE_ERROR and STATE_INSIDE_TEXT are used elsewhere in the file but their
// enumerator lines are not visible here.
43 STATE_AFTER_OPEN_ANGLE,
44 STATE_AFTER_CLOSE_ANGLE,
45 STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */
46 STATE_INSIDE_OPEN_TAG_NAME,
47 STATE_INSIDE_ATTRIBUTE_NAME,
48 STATE_AFTER_ATTRIBUTE_NAME,
49 STATE_BETWEEN_ATTRIBUTES,
50 STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
51 STATE_INSIDE_ATTRIBUTE_VALUE_SQ,
52 STATE_INSIDE_ATTRIBUTE_VALUE_DQ,
54 STATE_AFTER_CLOSE_TAG_SLASH,
55 STATE_INSIDE_CLOSE_TAG_NAME,
56 STATE_AFTER_CLOSE_TAG_NAME,
57 STATE_INSIDE_PASSTHROUGH,
// Per-parse state. Only some members are visible in this listing; others that
// the file uses (user_data, line_number, char_number, tag_stack, attr_names,
// attr_values, cur_attr, alloc_attrs, start, iter, parsing, balance) are
// declared on lines not shown here.
61 struct _GMarkupParseContext
// Callback table; its error, start_element, end_element, text and passthrough
// members are each invoked only when non-NULL.
63 const GMarkupParser *parser;
// Tested against G_MARKUP_TREAT_CDATA_AS_TEXT when a passthrough ends.
65 GMarkupParseFlags flags;
// Called with user_data when the context is freed, if non-NULL.
71 GDestroyNotify dnotify;
73 /* A piece of character data or an element that
74 * hasn't "ended" yet so we haven't yet called
75 * the callback for it.
77 GString *partial_chunk;
// Current GMarkupParseState; becomes STATE_ERROR once mark_error has run.
79 GMarkupParseState state;
// The chunk currently being parsed, its length in bytes, and the position
// where its last complete UTF-8 character ends.
86 const gchar *current_text;
87 gssize current_text_len;
88 const gchar *current_text_end;
// Bytes of a trailing incomplete UTF-8 character, kept until the next chunk
// supplies the rest.
90 GString *leftover_char_portion;
92 /* used to save the start of the last interesting thingy */
// TRUE from creation until the first open angle bracket is seen in STATE_START.
97 guint document_empty : 1;
103 * g_markup_parse_context_new:
104 * @parser: a #GMarkupParser
105 * @flags: one or more #GMarkupParseFlags
106 * @user_data: user data to pass to #GMarkupParser functions
107 * @user_data_dnotify: user data destroy notifier called when the parse context is freed
109 * Creates a new parse context. A parse context is used to parse
110 * marked-up documents. You can feed any number of documents into
111 * a context, as long as no errors occur; once an error occurs,
112 * the parse context can't continue to parse text (you have to free it
113 * and create a new parse context).
115 * Return value: a new #GMarkupParseContext
// Implementation notes: returns NULL (via g_return_val_if_fail) when parser is
// NULL. Every field is initialized explicitly after g_new.
117 GMarkupParseContext *
118 g_markup_parse_context_new (const GMarkupParser *parser,
119 GMarkupParseFlags flags,
121 GDestroyNotify user_data_dnotify)
123 GMarkupParseContext *context;
125 g_return_val_if_fail (parser != NULL, NULL);
127 context = g_new (GMarkupParseContext, 1);
// The parser pointer is stored, not copied.
129 context->parser = parser;
130 context->flags = flags;
131 context->user_data = user_data;
132 context->dnotify = user_data_dnotify;
// Positions reported in error messages are 1-based.
134 context->line_number = 1;
135 context->char_number = 1;
137 context->partial_chunk = NULL;
139 context->state = STATE_START;
140 context->tag_stack = NULL;
141 context->attr_names = NULL;
142 context->attr_values = NULL;
// -1 means no attribute has been collected for the current element yet.
143 context->cur_attr = -1;
144 context->alloc_attrs = 0;
// No chunk is attached until the first parse call; -1 marks the length unset.
146 context->current_text = NULL;
147 context->current_text_len = -1;
148 context->current_text_end = NULL;
149 context->leftover_char_portion = NULL;
151 context->start = NULL;
152 context->iter = NULL;
154 context->document_empty = TRUE;
155 context->parsing = FALSE;
157 context->balance = 0;
163 * g_markup_parse_context_free:
164 * @context: a #GMarkupParseContext
166 * Frees a #GMarkupParseContext. Can't be called from inside
167 * one of the #GMarkupParser functions.
171 g_markup_parse_context_free (GMarkupParseContext *context)
173 g_return_if_fail (context != NULL);
// context->parsing is TRUE while g_markup_parse_context_parse is running, so
// this rejects a free issued from within a parser callback.
174 g_return_if_fail (!context->parsing);
// Give the owner a chance to release user_data.
176 if (context->dnotify)
177 (* context->dnotify) (context->user_data);
// Both attribute arrays are kept NULL-terminated by add_attribute.
179 g_strfreev (context->attr_names);
180 g_strfreev (context->attr_values);
// Each tag_stack node owns its element-name string; free the strings, then
// the list nodes.
182 g_slist_foreach (context->tag_stack, (GFunc)g_free, NULL);
183 g_slist_free (context->tag_stack);
185 if (context->partial_chunk)
186 g_string_free (context->partial_chunk, TRUE);
188 if (context->leftover_char_portion)
189 g_string_free (context->leftover_char_portion, TRUE);
// NOTE(review): the line releasing the context struct itself is not visible
// in this listing.
// Puts the context into STATE_ERROR and, when the parser supplied an error
// callback, reports the error to it together with user_data.
195 mark_error (GMarkupParseContext *context,
198 context->state = STATE_ERROR;
200 if (context->parser->error)
201 (*context->parser->error) (context, error, context->user_data);
// Builds and raises a parse error. The printf-style format and arguments are
// rendered with g_strdup_vprintf, then wrapped in a G_MARKUP_ERROR GError whose
// message is prefixed with the current line_number and char_number. The
// context is marked failed via mark_error before the error is handed to the
// caller with g_propagate_error.
// The prototype carries G_GNUC_PRINTF (4, 5) so the compiler can check the
// format arguments.
204 static void set_error (GMarkupParseContext *context,
208 ...) G_GNUC_PRINTF (4, 5);
211 set_error (GMarkupParseContext *context,
221 va_start (args, format);
222 s = g_strdup_vprintf (format, args);
225 tmp_error = g_error_new (G_MARKUP_ERROR,
227 _("Error on line %d char %d: %s"),
228 context->line_number,
229 context->char_number,
// The parser error callback (if any) sees the error before the caller does.
234 mark_error (context, tmp_error);
236 g_propagate_error (error, tmp_error);
// IS_COMMON_NAME_END_CHAR(c) is true for the ASCII characters equals sign,
// slash, closing angle bracket and space. The name tests below use it to
// reject those characters before reaching the Unicode classification call.
240 /* To make these faster, we first use the ascii-only tests, then check
241 * for the usual non-alnum name-end chars, and only then call the
242 * expensive unicode stuff. Nobody uses non-ascii in XML tag/attribute
243 * names, so this is a reasonable hack that virtually always avoids
246 #define IS_COMMON_NAME_END_CHAR(c) \
247 ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')
// Tests whether the UTF-8 character at p may begin a tag, attribute or entity
// name. ASCII letters are accepted by the first test; characters matched by
// IS_COMMON_NAME_END_CHAR are rejected before the Unicode check
// (g_unichar_isalpha on the decoded character) is evaluated.
// NOTE(review): part of the condition and the return statements are on lines
// not visible in this listing.
250 is_name_start_char (const gchar *p)
252 if (g_ascii_isalpha (*p) ||
253 (!IS_COMMON_NAME_END_CHAR (*p) &&
256 g_unichar_isalpha (g_utf8_get_char (p)))))
// Tests whether the UTF-8 character at p may appear inside a name. ASCII
// letters and digits are accepted by the first test; characters matched by
// IS_COMMON_NAME_END_CHAR are rejected before the Unicode check
// (g_unichar_isalpha on the decoded character) is evaluated.
// NOTE(review): part of the condition and the return statements are on lines
// not visible in this listing.
263 is_name_char (const gchar *p)
265 if (g_ascii_isalnum (*p) ||
266 (!IS_COMMON_NAME_END_CHAR (*p) &&
271 g_unichar_isalpha (g_utf8_get_char (p)))))
// Encodes the Unicode character c as UTF-8 into the caller-supplied buffer.
// Callers pass the result straight to string-consuming functions such as
// g_string_append. NOTE(review): the buffer parameter declaration, any
// terminator write and the return statement are not visible in this listing.
279 char_str (gunichar c,
283 g_unichar_to_utf8 (c, buf);
// Decodes the first character of utf8 with g_utf8_get_char and re-encodes it
// into buf through char_str. It is used to quote a single offending character
// in error messages. NOTE(review): the return statement is not visible here.
288 utf8_str (const gchar *utf8,
291 char_str (g_utf8_get_char (utf8), buf);
// Raises an error found while unescaping text. The reported line is
// context->line_number minus a count accumulated while walking from
// remaining_text to remaining_text_end.
// NOTE(review): the test guarding the increment is not visible in this
// listing; the counter name indicates it counts newlines still ahead of the
// error position.
// Like set_error, it marks the context failed through mark_error and then
// propagates the GError to the caller.
296 set_unescape_error (GMarkupParseContext *context,
298 const gchar *remaining_text,
299 const gchar *remaining_text_end,
307 gint remaining_newlines;
310 remaining_newlines = 0;
312 while (p != remaining_text_end)
315 ++remaining_newlines;
319 va_start (args, format);
320 s = g_strdup_vprintf (format, args);
323 tmp_error = g_error_new (G_MARKUP_ERROR,
325 _("Error on line %d: %s"),
326 context->line_number - remaining_newlines,
331 mark_error (context, tmp_error);
333 g_propagate_error (error, tmp_error);
// States of the unescaping state machine driven by unescape_text.
// NOTE(review): USTATE_INSIDE_TEXT is used elsewhere in the file but its
// enumerator line is not visible in this listing.
339 USTATE_AFTER_AMPERSAND,
340 USTATE_INSIDE_ENTITY_NAME,
341 USTATE_AFTER_CHARREF_HASH
// Working state shared by the unescape_text_state_* helpers.
// NOTE(review): the members str, state and text are used elsewhere in the
// file but their declarations are not visible in this listing.
// context: the owning parse context, used for error reporting and state checks.
346 GMarkupParseContext *context;
// text_end: one past the last byte of the text being unescaped.
350 const gchar *text_end;
// entity_start: first byte of the entity name or character-reference digits
// currently being read; reset to NULL after each ampersand.
351 const gchar *entity_start;
// Handles USTATE_INSIDE_TEXT: copies literal text into ucontext->str until an
// ampersand or the end of the text is reached.
// When the parse context is inside an attribute value (single- or
// double-quoted), tab and newline characters are each replaced by one space.
// On an ampersand, steps past it and switches to USTATE_AFTER_AMPERSAND.
// NOTE(review): several conditions and the return statement are on lines not
// visible in this listing.
355 unescape_text_state_inside_text (UnescapeContext *ucontext,
360 gboolean normalize_attribute;
362 if (ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
363 ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
364 normalize_attribute = TRUE;
366 normalize_attribute = FALSE;
370 while (p != ucontext->text_end)
376 else if (normalize_attribute && (*p == '\t' || *p == '\n'))
// Flush the pending literal run, then emit the normalized space.
378 g_string_append_len (ucontext->str, start, p - start);
379 g_string_append_c (ucontext->str, ' ');
380 p = g_utf8_next_char (p);
// This branch emits a single character (space when normalizing, newline
// otherwise) and swallows a directly following newline.
// NOTE(review): its condition is not visible; it looks like carriage-return
// handling - confirm against the full file.
385 g_string_append_len (ucontext->str, start, p - start);
386 g_string_append_c (ucontext->str, normalize_attribute ? ' ' : '\n');
387 p = g_utf8_next_char (p);
388 if (p != ucontext->text_end && *p == '\n')
389 p = g_utf8_next_char (p);
393 p = g_utf8_next_char (p);
// Flush whatever literal text is still pending.
397 g_string_append_len (ucontext->str, start, p - start);
399 if (p != ucontext->text_end && *p == '&')
401 p = g_utf8_next_char (p);
402 ucontext->state = USTATE_AFTER_AMPERSAND;
409 unescape_text_state_after_ampersand (UnescapeContext *ucontext,
413 ucontext->entity_start = NULL;
417 p = g_utf8_next_char (p);
419 ucontext->entity_start = p;
420 ucontext->state = USTATE_AFTER_CHARREF_HASH;
422 else if (!is_name_start_char (p))
426 set_unescape_error (ucontext->context, error,
427 p, ucontext->text_end,
428 G_MARKUP_ERROR_PARSE,
429 _("Empty entity '&;' seen; valid "
430 "entities are: & " < > '"));
436 set_unescape_error (ucontext->context, error,
437 p, ucontext->text_end,
438 G_MARKUP_ERROR_PARSE,
439 _("Character '%s' is not valid at "
440 "the start of an entity name; "
441 "the & character begins an entity; "
442 "if this ampersand isn't supposed "
443 "to be an entity, escape it as "
450 ucontext->entity_start = p;
451 ucontext->state = USTATE_INSIDE_ENTITY_NAME;
// Handles USTATE_INSIDE_ENTITY_NAME: scans forward from p over the entity
// name that began at ucontext->entity_start.
//  - A character that is not valid in a name raises a G_MARKUP_ERROR_PARSE
//    error.
//  - If the scan stops before text_end (at the terminating semicolon), the
//    name is looked up among the five predefined entities lt, gt, amp, quot
//    and apos and the matching character is appended to ucontext->str; any
//    other name is an error. The state returns to USTATE_INSIDE_TEXT.
//  - If the text ends first, the missing semicolon is reported.
// NOTE(review): the loop exit test for the semicolon and the return statement
// are on lines not visible in this listing.
458 unescape_text_state_inside_entity_name (UnescapeContext *ucontext,
462 while (p != ucontext->text_end)
466 else if (!is_name_char (p))
470 set_unescape_error (ucontext->context, error,
471 p, ucontext->text_end,
472 G_MARKUP_ERROR_PARSE,
473 _("Character '%s' is not valid "
474 "inside an entity name"),
479 p = g_utf8_next_char (p);
482 if (ucontext->context->state != STATE_ERROR)
484 if (p != ucontext->text_end)
// Length in bytes of the name between the ampersand and the semicolon.
486 gint len = p - ucontext->entity_start;
488 /* move to after semicolon */
489 p = g_utf8_next_char (p);
490 ucontext->state = USTATE_INSIDE_TEXT;
// Require an exact length match. strncmp bounded by len alone would accept any
// proper prefix of a known name (a one-letter name l would match lt, am would
// match amp), silently turning an unknown entity into a character.
492 if (len == 2 && strncmp (ucontext->entity_start, "lt", len) == 0)
493 g_string_append_c (ucontext->str, '<');
494 else if (len == 2 && strncmp (ucontext->entity_start, "gt", len) == 0)
495 g_string_append_c (ucontext->str, '>');
496 else if (len == 3 && strncmp (ucontext->entity_start, "amp", len) == 0)
497 g_string_append_c (ucontext->str, '&');
498 else if (len == 4 && strncmp (ucontext->entity_start, "quot", len) == 0)
499 g_string_append_c (ucontext->str, '"');
500 else if (len == 4 && strncmp (ucontext->entity_start, "apos", len) == 0)
501 g_string_append_c (ucontext->str, '\'');
// Unknown name: copy it out so it can be quoted in the error message.
506 name = g_strndup (ucontext->entity_start, len);
507 set_unescape_error (ucontext->context, error,
508 p, ucontext->text_end,
509 G_MARKUP_ERROR_PARSE,
510 _("Entity name '%s' is not known"),
517 set_unescape_error (ucontext->context, error,
518 /* give line number of the & */
519 ucontext->entity_start, ucontext->text_end,
520 G_MARKUP_ERROR_PARSE,
521 _("Entity did not end with a semicolon; "
522 "most likely you used an ampersand "
523 "character without intending to start "
524 "an entity - escape ampersand as &amp;"));
// Handles USTATE_AFTER_CHARREF_HASH: parses a numeric character reference
// whose digits begin at ucontext->entity_start.
//  - Scans to the terminating semicolon; reaching text_end first is an error.
//  - Converts the digits with strtoul, base 16 when is_hex is set, base 10
//    otherwise. A conversion that does not stop exactly at the semicolon, or
//    that sets errno, is an error.
//  - Only code points in the ranges tested below are appended (as UTF-8) to
//    ucontext->str; others are rejected as not permitted.
//  - On success the state returns to USTATE_INSIDE_TEXT with p past the
//    semicolon.
// NOTE(review): the hex-prefix test, the empty-reference test, the first part
// of the permitted-range condition and the return statement are on lines not
// visible in this listing.
533 unescape_text_state_after_charref_hash (UnescapeContext *ucontext,
537 gboolean is_hex = FALSE;
540 start = ucontext->entity_start;
545 p = g_utf8_next_char (p);
549 while (p != ucontext->text_end && *p != ';')
550 p = g_utf8_next_char (p);
552 if (p != ucontext->text_end)
554 g_assert (*p == ';');
556 /* digit is between start and p */
565 l = strtoul (start, &end, 16);
567 l = strtoul (start, &end, 10);
// end must land exactly on the semicolon, otherwise non-digit characters were
// present; a non-zero errno signals a conversion failure such as overflow.
569 if (end != p || errno != 0)
571 set_unescape_error (ucontext->context, error,
572 start, ucontext->text_end,
573 G_MARKUP_ERROR_PARSE,
574 _("Failed to parse '%-.*s', which "
575 "should have been a digit "
576 "inside a character reference "
577 "(&#234; for example) - perhaps "
578 "the digit is too large"),
583 /* characters XML permits */
587 (l >= 0x20 && l <= 0xD7FF) ||
588 (l >= 0xE000 && l <= 0xFFFD) ||
589 (l >= 0x10000 && l <= 0x10FFFF))
592 g_string_append (ucontext->str, char_str (l, buf));
596 set_unescape_error (ucontext->context, error,
597 start, ucontext->text_end,
598 G_MARKUP_ERROR_PARSE,
599 _("Character reference '%-.*s' does not "
600 "encode a permitted character"),
605 /* Move to next state */
606 p = g_utf8_next_char (p); /* past semicolon */
607 ucontext->state = USTATE_INSIDE_TEXT;
611 set_unescape_error (ucontext->context, error,
612 start, ucontext->text_end,
613 G_MARKUP_ERROR_PARSE,
614 _("Empty character reference; "
615 "should include a digit such as "
621 set_unescape_error (ucontext->context, error,
622 start, ucontext->text_end,
623 G_MARKUP_ERROR_PARSE,
624 _("Character reference did not end with a "
626 "most likely you used an ampersand "
627 "character without intending to start "
628 "an entity - escape ampersand as &amp;"));
// Unescapes the text between text and text_end into a newly allocated GString.
// Runs the USTATE_* state machine, dispatching to one helper per state, until
// the text is consumed or the parse context enters STATE_ERROR.
// If the text ends in any state other than USTATE_INSIDE_TEXT, an unfinished
// entity or character reference is reported.
// On error the accumulated string is freed; otherwise it is stored in
// *unescaped and the caller owns it.
// Callers test the result as a boolean. NOTE(review): the return statements
// are not visible in this listing.
635 unescape_text (GMarkupParseContext *context,
637 const gchar *text_end,
641 UnescapeContext ucontext;
644 ucontext.context = context;
645 ucontext.text = text;
646 ucontext.text_end = text_end;
647 ucontext.entity_start = NULL;
// Pre-size the output to the input length.
649 ucontext.str = g_string_sized_new (text_end - text);
651 ucontext.state = USTATE_INSIDE_TEXT;
654 while (p != text_end && context->state != STATE_ERROR)
656 g_assert (p < text_end);
658 switch (ucontext.state)
660 case USTATE_INSIDE_TEXT:
662 p = unescape_text_state_inside_text (&ucontext,
668 case USTATE_AFTER_AMPERSAND:
670 p = unescape_text_state_after_ampersand (&ucontext,
677 case USTATE_INSIDE_ENTITY_NAME:
679 p = unescape_text_state_inside_entity_name (&ucontext,
685 case USTATE_AFTER_CHARREF_HASH:
687 p = unescape_text_state_after_charref_hash (&ucontext,
694 g_assert_not_reached ();
// The input is exhausted: anything but plain-text state means a reference
// was left open.
699 if (context->state != STATE_ERROR)
701 switch (ucontext.state)
703 case USTATE_INSIDE_TEXT:
705 case USTATE_AFTER_AMPERSAND:
706 case USTATE_INSIDE_ENTITY_NAME:
707 set_unescape_error (context, error,
709 G_MARKUP_ERROR_PARSE,
710 _("Unfinished entity reference"));
712 case USTATE_AFTER_CHARREF_HASH:
713 set_unescape_error (context, error,
715 G_MARKUP_ERROR_PARSE,
716 _("Unfinished character reference"));
721 if (context->state == STATE_ERROR)
723 g_string_free (ucontext.str, TRUE);
729 *unescaped = ucontext.str;
// Moves context->iter to the next UTF-8 character and increments char_number.
// When the new position holds a newline, line_number is incremented and
// char_number is reset to 1.
// Callers use the result as a loop condition. NOTE(review): the return
// statements are not visible in this listing; presumably FALSE once iter
// reaches current_text_end and TRUE otherwise.
734 static inline gboolean
735 advance_char (GMarkupParseContext *context)
737 context->iter = g_utf8_next_char (context->iter);
738 context->char_number += 1;
// The end-of-chunk test comes first, so *iter is read only while in bounds.
740 if (context->iter == context->current_text_end)
744 else if (*context->iter == '\n')
746 context->line_number += 1;
747 context->char_number = 1;
// Whitespace predicate: true for space, tab, newline and carriage return.
// NOTE(review): the line naming this function is not visible in this listing;
// skip_spaces calls xml_isspace, which is presumably this helper.
753 static inline gboolean
756 return c == ' ' || c == '\t' || c == '\n' || c == '\r';
// Advances context->iter over whitespace, using advance_char so line and
// character counters stay accurate. It stops at the first character for which
// xml_isspace is false, or when advance_char reports no more input.
// NOTE(review): the loop opening and the statement taken on a non-space
// character are not visible in this listing.
760 skip_spaces (GMarkupParseContext *context)
764 if (!xml_isspace (*context->iter))
767 while (advance_char (context));
// Advances context->iter over name characters, using advance_char so line and
// character counters stay accurate. It stops at the first character for which
// is_name_char is false, or when advance_char reports no more input.
// NOTE(review): the loop opening and the statement taken on a non-name
// character are not visible in this listing.
771 advance_to_name_end (GMarkupParseContext *context)
775 if (!is_name_char (context->iter))
778 while (advance_char (context));
// Appends the bytes in [text_start, text_end) to context->partial_chunk,
// creating the GString first if it does not exist yet. An empty range still
// guarantees that partial_chunk is non-NULL afterwards, which callers rely on
// when they dereference it.
782 add_to_partial (GMarkupParseContext *context,
783 const gchar *text_start,
784 const gchar *text_end)
786 if (context->partial_chunk == NULL)
787 context->partial_chunk = g_string_sized_new (text_end - text_start);
789 if (text_start != text_end)
790 g_string_append_len (context->partial_chunk, text_start,
791 text_end - text_start);
793 /* Invariant here that partial_chunk exists */
// Resets partial_chunk to zero length when it exists. The GString object
// itself is kept, not freed.
797 truncate_partial (GMarkupParseContext *context)
799 if (context->partial_chunk != NULL)
801 context->partial_chunk = g_string_truncate (context->partial_chunk, 0);
// Returns the name stored at the head of tag_stack, i.e. the most recently
// pushed (innermost open) element. There is no NULL check here, so the
// caller must ensure tag_stack is non-empty.
806 current_element (GMarkupParseContext *context)
808 return context->tag_stack->data;
// Returns the name of the attribute at index cur_attr. Asserts that at least
// one attribute has been added (cur_attr starts at -1).
812 current_attribute (GMarkupParseContext *context)
814 g_assert (context->cur_attr >= 0);
815 return context->attr_names[context->cur_attr];
// Computes context->current_text_end for the current chunk, which must be
// non-empty (asserted).
// It locates the start of the last character in the chunk with
// g_utf8_find_prev_char. If that character is complete, current_text_end is
// the end of the chunk. Otherwise the trailing bytes are copied into
// leftover_char_portion, current_text_len is reduced by their count and
// current_text_end points at the start of the partial character.
// NOTE(review): the test that distinguishes the two cases is on a line not
// visible in this listing.
819 find_current_text_end (GMarkupParseContext *context)
821 /* This function must be safe (non-segfaulting) on invalid UTF8.
822 * It assumes the string starts with a character start
824 const gchar *end = context->current_text + context->current_text_len;
828 g_assert (context->current_text_len > 0);
830 p = g_utf8_find_prev_char (context->current_text, end);
832 g_assert (p != NULL); /* since current_text was a char start */
834 /* p is now the start of the last character or character portion. */
836 next = g_utf8_next_char (p); /* this only touches *p, nothing beyond */
840 /* whole character */
841 context->current_text_end = end;
// Incomplete trailing character: stash its bytes for the next parse call.
846 context->leftover_char_portion = g_string_new_len (p, end - p);
847 context->current_text_len -= (end - p);
848 context->current_text_end = p;
// Records name as a new attribute with a NULL value and keeps attr_names and
// attr_values NULL-terminated. Both arrays grow by 5 slots whenever
// cur_attr + 2 would reach alloc_attrs, leaving room for the new entry plus
// its terminator.
// The name pointer is stored as-is and later released with g_free when the
// attributes are cleared, so ownership passes to the context.
// NOTE(review): the statement that advances cur_attr before the stores is not
// visible in this listing.
854 add_attribute (GMarkupParseContext *context, char *name)
856 if (context->cur_attr + 2 >= context->alloc_attrs)
858 context->alloc_attrs += 5; /* silly magic number */
859 context->attr_names = g_realloc (context->attr_names, sizeof(char*)*context->alloc_attrs);
860 context->attr_values = g_realloc (context->attr_values, sizeof(char*)*context->alloc_attrs);
863 context->attr_names[context->cur_attr] = name;
864 context->attr_values[context->cur_attr] = NULL;
865 context->attr_names[context->cur_attr+1] = NULL;
866 context->attr_values[context->cur_attr+1] = NULL;
870 * g_markup_parse_context_parse:
871 * @context: a #GMarkupParseContext
872 * @text: chunk of text to parse
873 * @text_len: length of @text in bytes
874 * @error: return location for a #GError
876 * Feed some data to the #GMarkupParseContext. The data need not
877 * be valid UTF-8; an error will be signaled if it's invalid.
878 * The data need not be an entire document; you can feed a document
879 * into the parser incrementally, via multiple calls to this function.
880 * Typically, as you receive data from a network connection or file,
881 * you feed each received chunk of data into this function, aborting
882 * the process if an error occurs. Once an error is reported, no further
883 * data may be fed to the #GMarkupParseContext; all errors are fatal.
885 * Return value: %FALSE if an error occurred, %TRUE on success
888 g_markup_parse_context_parse (GMarkupParseContext *context,
893 const gchar *first_invalid;
895 g_return_val_if_fail (context != NULL, FALSE);
896 g_return_val_if_fail (text != NULL, FALSE);
897 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
898 g_return_val_if_fail (!context->parsing, FALSE);
901 text_len = strlen (text);
906 context->parsing = TRUE;
908 if (context->leftover_char_portion)
910 const gchar *first_char;
912 if ((*text & 0xc0) != 0x80)
915 first_char = g_utf8_find_next_char (text, text + text_len);
919 /* leftover_char_portion was completed. Parse it. */
920 GString *portion = context->leftover_char_portion;
922 g_string_append_len (context->leftover_char_portion,
923 text, first_char - text);
925 /* hacks to allow recursion */
926 context->parsing = FALSE;
927 context->leftover_char_portion = NULL;
929 if (!g_markup_parse_context_parse (context,
930 portion->str, portion->len,
933 g_assert (context->state == STATE_ERROR);
936 g_string_free (portion, TRUE);
937 context->parsing = TRUE;
939 /* Skip the fraction of char that was in this text */
940 text_len -= (first_char - text);
945 /* another little chunk of the leftover char; geez
946 * someone is inefficient.
948 g_string_append_len (context->leftover_char_portion,
951 if (context->leftover_char_portion->len > 7)
953 /* The leftover char portion is too big to be
958 G_MARKUP_ERROR_BAD_UTF8,
959 _("Invalid UTF-8 encoded text"));
966 context->current_text = text;
967 context->current_text_len = text_len;
968 context->iter = context->current_text;
969 context->start = context->iter;
971 /* Nothing left after finishing the leftover char, or nothing
972 * passed in to begin with.
974 if (context->current_text_len == 0)
977 /* find_current_text_end () assumes the string starts at
978 * a character start, so we need to validate at least
979 * that much. It doesn't assume any following bytes
982 if ((*context->current_text & 0xc0) == 0x80) /* not a char start */
986 G_MARKUP_ERROR_BAD_UTF8,
987 _("Invalid UTF-8 encoded text"));
991 /* Initialize context->current_text_end, possibly adjusting
992 * current_text_len, and add any leftover char portion
994 find_current_text_end (context);
996 /* Validate UTF8 (must be done after we find the end, since
997 * we could have a trailing incomplete char)
999 if (!g_utf8_validate (context->current_text,
1000 context->current_text_len,
1005 p = context->current_text;
1006 while (p != context->current_text_end)
1013 context->line_number += newlines;
1017 G_MARKUP_ERROR_BAD_UTF8,
1018 _("Invalid UTF-8 encoded text"));
1022 while (context->iter != context->current_text_end)
1024 switch (context->state)
1027 /* Possible next state: AFTER_OPEN_ANGLE */
1029 g_assert (context->tag_stack == NULL);
1031 /* whitespace is ignored outside of any elements */
1032 skip_spaces (context);
1034 if (context->iter != context->current_text_end)
1036 if (*context->iter == '<')
1038 /* Move after the open angle */
1039 advance_char (context);
1041 context->state = STATE_AFTER_OPEN_ANGLE;
1043 /* this could start a passthrough */
1044 context->start = context->iter;
1046 /* document is now non-empty */
1047 context->document_empty = FALSE;
1053 G_MARKUP_ERROR_PARSE,
1054 _("Document must begin with an element (e.g. <book>)"));
1059 case STATE_AFTER_OPEN_ANGLE:
1060 /* Possible next states: INSIDE_OPEN_TAG_NAME,
1061 * AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH
1063 if (*context->iter == '?' ||
1064 *context->iter == '!')
1066 /* include < in the passthrough */
1067 const gchar *openangle = "<";
1068 add_to_partial (context, openangle, openangle + 1);
1069 context->start = context->iter;
1070 context->balance = 1;
1071 context->state = STATE_INSIDE_PASSTHROUGH;
1073 else if (*context->iter == '/')
1076 advance_char (context);
1078 context->state = STATE_AFTER_CLOSE_TAG_SLASH;
1080 else if (is_name_start_char (context->iter))
1082 context->state = STATE_INSIDE_OPEN_TAG_NAME;
1084 /* start of tag name */
1085 context->start = context->iter;
1093 G_MARKUP_ERROR_PARSE,
1094 _("'%s' is not a valid character following "
1095 "a '<' character; it may not begin an "
1097 utf8_str (context->iter, buf));
1101 /* The AFTER_CLOSE_ANGLE state is actually sort of
1102 * broken, because it doesn't correspond to a range
1103 * of characters in the input stream as the others do,
1104 * and thus makes things harder to conceptualize
1106 case STATE_AFTER_CLOSE_ANGLE:
1107 /* Possible next states: INSIDE_TEXT, STATE_START */
1108 if (context->tag_stack == NULL)
1110 context->start = NULL;
1111 context->state = STATE_START;
1115 context->start = context->iter;
1116 context->state = STATE_INSIDE_TEXT;
1120 case STATE_AFTER_ELISION_SLASH:
1121 /* Possible next state: AFTER_CLOSE_ANGLE */
1124 /* We need to pop the tag stack and call the end_element
1125 * function, since this is the close tag
1127 GError *tmp_error = NULL;
1129 g_assert (context->tag_stack != NULL);
1132 if (context->parser->end_element)
1133 (* context->parser->end_element) (context,
1134 context->tag_stack->data,
1140 mark_error (context, tmp_error);
1141 g_propagate_error (error, tmp_error);
1145 if (*context->iter == '>')
1147 /* move after the close angle */
1148 advance_char (context);
1149 context->state = STATE_AFTER_CLOSE_ANGLE;
1157 G_MARKUP_ERROR_PARSE,
1158 _("Odd character '%s', expected a '>' character "
1159 "to end the start tag of element '%s'"),
1160 utf8_str (context->iter, buf),
1161 current_element (context));
1165 g_free (context->tag_stack->data);
1166 context->tag_stack = g_slist_delete_link (context->tag_stack,
1167 context->tag_stack);
1171 case STATE_INSIDE_OPEN_TAG_NAME:
1172 /* Possible next states: BETWEEN_ATTRIBUTES */
1174 /* if there's a partial chunk then it's the first part of the
1175 * tag name. If there's a context->start then it's the start
1176 * of the tag name in current_text, the partial chunk goes
1177 * before that start though.
1179 advance_to_name_end (context);
1181 if (context->iter == context->current_text_end)
1183 /* The name hasn't necessarily ended. Merge with
1184 * partial chunk, leave state unchanged.
1186 add_to_partial (context, context->start, context->iter);
1190 /* The name has ended. Combine it with the partial chunk
1191 * if any; push it on the stack; enter next state.
1193 add_to_partial (context, context->start, context->iter);
1194 context->tag_stack =
1195 g_slist_prepend (context->tag_stack,
1196 g_string_free (context->partial_chunk,
1199 context->partial_chunk = NULL;
1201 context->state = STATE_BETWEEN_ATTRIBUTES;
1202 context->start = NULL;
1206 case STATE_INSIDE_ATTRIBUTE_NAME:
1207 /* Possible next states: AFTER_ATTRIBUTE_NAME */
1209 advance_to_name_end (context);
1210 add_to_partial (context, context->start, context->iter);
1212 /* read the full name, if we enter the equals sign state
1213 * then add the attribute to the list (without the value),
1214 * otherwise store a partial chunk to be prepended later.
1216 if (context->iter != context->current_text_end)
1217 context->state = STATE_AFTER_ATTRIBUTE_NAME;
1220 case STATE_AFTER_ATTRIBUTE_NAME:
1221 /* Possible next states: AFTER_ATTRIBUTE_EQUALS_SIGN */
1223 skip_spaces (context);
1225 if (context->iter != context->current_text_end)
1227 /* The name has ended. Combine it with the partial chunk
1228 * if any; push it on the stack; enter next state.
1230 add_attribute (context, g_string_free (context->partial_chunk, FALSE));
1232 context->partial_chunk = NULL;
1233 context->start = NULL;
1235 if (*context->iter == '=')
1237 advance_char (context);
1238 context->state = STATE_AFTER_ATTRIBUTE_EQUALS_SIGN;
1246 G_MARKUP_ERROR_PARSE,
1247 _("Odd character '%s', expected a '=' after "
1248 "attribute name '%s' of element '%s'"),
1249 utf8_str (context->iter, buf),
1250 current_attribute (context),
1251 current_element (context));
1257 case STATE_BETWEEN_ATTRIBUTES:
1258 /* Possible next states: AFTER_CLOSE_ANGLE,
1259 * AFTER_ELISION_SLASH, INSIDE_ATTRIBUTE_NAME
1261 skip_spaces (context);
1263 if (context->iter != context->current_text_end)
1265 if (*context->iter == '/')
1267 advance_char (context);
1268 context->state = STATE_AFTER_ELISION_SLASH;
1270 else if (*context->iter == '>')
1273 advance_char (context);
1274 context->state = STATE_AFTER_CLOSE_ANGLE;
1276 else if (is_name_start_char (context->iter))
1278 context->state = STATE_INSIDE_ATTRIBUTE_NAME;
1279 /* start of attribute name */
1280 context->start = context->iter;
1288 G_MARKUP_ERROR_PARSE,
1289 _("Odd character '%s', expected a '>' or '/' "
1290 "character to end the start tag of "
1291 "element '%s', or optionally an attribute; "
1292 "perhaps you used an invalid character in "
1293 "an attribute name"),
1294 utf8_str (context->iter, buf),
1295 current_element (context));
1298 /* If we're done with attributes, invoke
1299 * the start_element callback
1301 if (context->state == STATE_AFTER_ELISION_SLASH ||
1302 context->state == STATE_AFTER_CLOSE_ANGLE)
1304 const gchar *start_name;
1305 /* Ugly, but the current code expects an empty array instead of NULL */
1306 const gchar *empty = NULL;
1307 const gchar **attr_names = &empty;
1308 const gchar **attr_values = &empty;
1311 /* Call user callback for element start */
1312 start_name = current_element (context);
1314 if (context->cur_attr >= 0)
1316 attr_names = (const gchar**)context->attr_names;
1317 attr_values = (const gchar**)context->attr_values;
1321 if (context->parser->start_element)
1322 (* context->parser->start_element) (context,
1324 (const gchar **)attr_names,
1325 (const gchar **)attr_values,
1329 /* Go ahead and free the attributes. */
1330 for (; context->cur_attr >= 0; context->cur_attr--)
1332 int pos = context->cur_attr;
1333 g_free (context->attr_names[pos]);
1334 g_free (context->attr_values[pos]);
1335 context->attr_names[pos] = context->attr_values[pos] = NULL;
1337 g_assert (context->cur_attr == -1);
1338 g_assert (context->attr_names == NULL ||
1339 context->attr_names[0] == NULL);
1340 g_assert (context->attr_values == NULL ||
1341 context->attr_values[0] == NULL);
1343 if (tmp_error != NULL)
1345 mark_error (context, tmp_error);
1346 g_propagate_error (error, tmp_error);
1352 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1353 /* Possible next state: INSIDE_ATTRIBUTE_VALUE_[SQ/DQ] */
1355 skip_spaces (context);
1357 if (context->iter != context->current_text_end)
1359 if (*context->iter == '"')
1361 advance_char (context);
1362 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_DQ;
1363 context->start = context->iter;
1365 else if (*context->iter == '\'')
1367 advance_char (context);
1368 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_SQ;
1369 context->start = context->iter;
1377 G_MARKUP_ERROR_PARSE,
1378 _("Odd character '%s', expected an open quote mark "
1379 "after the equals sign when giving value for "
1380 "attribute '%s' of element '%s'"),
1381 utf8_str (context->iter, buf),
1382 current_attribute (context),
1383 current_element (context));
1388 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1389 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1390 /* Possible next states: BETWEEN_ATTRIBUTES */
1394 if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ)
1405 if (*context->iter == delim)
1408 while (advance_char (context));
1410 if (context->iter == context->current_text_end)
1412 /* The value hasn't necessarily ended. Merge with
1413 * partial chunk, leave state unchanged.
1415 add_to_partial (context, context->start, context->iter);
1419 /* The value has ended at the quote mark. Combine it
1420 * with the partial chunk if any; set it for the current
1425 add_to_partial (context, context->start, context->iter);
1427 g_assert (context->cur_attr >= 0);
1429 if (unescape_text (context,
1430 context->partial_chunk->str,
1431 context->partial_chunk->str +
1432 context->partial_chunk->len,
1436 /* success, advance past quote and set state. */
1437 context->attr_values[context->cur_attr] = g_string_free (unescaped, FALSE);
1438 advance_char (context);
1439 context->state = STATE_BETWEEN_ATTRIBUTES;
1440 context->start = NULL;
1443 truncate_partial (context);
1447 case STATE_INSIDE_TEXT:
1448 /* Possible next states: AFTER_OPEN_ANGLE */
1451 if (*context->iter == '<')
1454 while (advance_char (context));
1456 /* The text hasn't necessarily ended. Merge with
1457 * partial chunk, leave state unchanged.
1460 add_to_partial (context, context->start, context->iter);
1462 if (context->iter != context->current_text_end)
1464 GString *unescaped = NULL;
1466 /* The text has ended at the open angle. Call the text
1470 if (unescape_text (context,
1471 context->partial_chunk->str,
1472 context->partial_chunk->str +
1473 context->partial_chunk->len,
1477 GError *tmp_error = NULL;
1479 if (context->parser->text)
1480 (*context->parser->text) (context,
1486 g_string_free (unescaped, TRUE);
1488 if (tmp_error == NULL)
1490 /* advance past open angle and set state. */
1491 advance_char (context);
1492 context->state = STATE_AFTER_OPEN_ANGLE;
1493 /* could begin a passthrough */
1494 context->start = context->iter;
1498 mark_error (context, tmp_error);
1499 g_propagate_error (error, tmp_error);
1503 truncate_partial (context);
1507 case STATE_AFTER_CLOSE_TAG_SLASH:
1508 /* Possible next state: INSIDE_CLOSE_TAG_NAME */
1509 if (is_name_start_char (context->iter))
1511 context->state = STATE_INSIDE_CLOSE_TAG_NAME;
1513 /* start of tag name */
1514 context->start = context->iter;
1522 G_MARKUP_ERROR_PARSE,
1523 _("'%s' is not a valid character following "
1524 "the characters '</'; '%s' may not begin an "
1526 utf8_str (context->iter, buf),
1527 utf8_str (context->iter, buf));
1531 case STATE_INSIDE_CLOSE_TAG_NAME:
1532 /* Possible next state: AFTER_CLOSE_TAG_NAME */
1533 advance_to_name_end (context);
1534 add_to_partial (context, context->start, context->iter);
1536 if (context->iter != context->current_text_end)
1537 context->state = STATE_AFTER_CLOSE_TAG_NAME;
1540 case STATE_AFTER_CLOSE_TAG_NAME:
1541 /* Possible next state: AFTER_CLOSE_TAG_SLASH */
1543 skip_spaces (context);
1545 if (context->iter != context->current_text_end)
1549 /* The name has ended. Combine it with the partial chunk
1550 * if any; check that it matches stack top and pop
1551 * stack; invoke proper callback; enter next state.
1553 close_name = g_string_free (context->partial_chunk, FALSE);
1554 context->partial_chunk = NULL;
1556 if (*context->iter != '>')
1562 G_MARKUP_ERROR_PARSE,
1563 _("'%s' is not a valid character following "
1564 "the close element name '%s'; the allowed "
1565 "character is '>'"),
1566 utf8_str (context->iter, buf),
1569 else if (context->tag_stack == NULL)
1573 G_MARKUP_ERROR_PARSE,
1574 _("Element '%s' was closed, no element "
1575 "is currently open"),
1578 else if (strcmp (close_name, current_element (context)) != 0)
1582 G_MARKUP_ERROR_PARSE,
1583 _("Element '%s' was closed, but the currently "
1584 "open element is '%s'"),
1586 current_element (context));
1591 advance_char (context);
1592 context->state = STATE_AFTER_CLOSE_ANGLE;
1593 context->start = NULL;
1595 /* call the end_element callback */
1597 if (context->parser->end_element)
1598 (* context->parser->end_element) (context,
1604 /* Pop the tag stack */
1605 g_free (context->tag_stack->data);
1606 context->tag_stack = g_slist_delete_link (context->tag_stack,
1607 context->tag_stack);
1611 mark_error (context, tmp_error);
1612 g_propagate_error (error, tmp_error);
1616 g_free (close_name);
1620 case STATE_INSIDE_PASSTHROUGH:
1621 /* Possible next state: AFTER_CLOSE_ANGLE */
1624 if (*context->iter == '<')
1626 if (*context->iter == '>')
1629 add_to_partial (context, context->start, context->iter);
1630 context->start = context->iter;
1631 if ((g_str_has_prefix (context->partial_chunk->str, "<?")
1632 && g_str_has_suffix (context->partial_chunk->str, "?")) ||
1633 (g_str_has_prefix (context->partial_chunk->str, "<!--")
1634 && g_str_has_suffix (context->partial_chunk->str, "--")) ||
1635 (g_str_has_prefix (context->partial_chunk->str, "<![CDATA[")
1636 && g_str_has_suffix (context->partial_chunk->str, "]]")) ||
1637 (g_str_has_prefix (context->partial_chunk->str, "<!DOCTYPE")
1638 && context->balance == 0))
1642 while (advance_char (context));
1644 if (context->iter == context->current_text_end)
1646 /* The passthrough hasn't necessarily ended. Merge with
1647 * partial chunk, leave state unchanged.
1649 add_to_partial (context, context->start, context->iter);
1653 /* The passthrough has ended at the close angle. Combine
1654 * it with the partial chunk if any. Call the passthrough
1655 * callback. Note that the open/close angles are
1656 * included in the text of the passthrough.
1658 GError *tmp_error = NULL;
1660 advance_char (context); /* advance past close angle */
1661 add_to_partial (context, context->start, context->iter);
1663 if (context->flags & G_MARKUP_TREAT_CDATA_AS_TEXT &&
1664 g_str_has_prefix (context->partial_chunk->str, "<![CDATA[") &&
1665 g_str_has_suffix (context->partial_chunk->str, "]]>"))
1667 if (context->parser->text)
1668 (*context->parser->text) (context,
1669 context->partial_chunk->str + strlen ("<![CDATA["),
1670 context->partial_chunk->len - strlen ("<![CDATA[" "]]>"),
1674 else if (context->parser->passthrough)
1675 (*context->parser->passthrough) (context,
1676 context->partial_chunk->str,
1677 context->partial_chunk->len,
1681 truncate_partial (context);
1683 if (tmp_error == NULL)
1685 context->state = STATE_AFTER_CLOSE_ANGLE;
1686 context->start = context->iter; /* could begin text */
1690 mark_error (context, tmp_error);
1691 g_propagate_error (error, tmp_error);
1701 g_assert_not_reached ();
1707 context->parsing = FALSE;
1709 return context->state != STATE_ERROR;
1713 * g_markup_parse_context_end_parse:
1714 * @context: a #GMarkupParseContext
1715 * @error: return location for a #GError
1717 * Signals to the #GMarkupParseContext that all data has been
1718 * fed into the parse context with g_markup_parse_context_parse().
1719 * This function reports an error if the document isn't complete,
1720 * for example if elements are still open.
1722 * Return value: %TRUE on success, %FALSE if an error was set
1725 g_markup_parse_context_end_parse (GMarkupParseContext *context,
1728 g_return_val_if_fail (context != NULL, FALSE);
1729 g_return_val_if_fail (!context->parsing, FALSE);
1730 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
1732 if (context->partial_chunk != NULL)
1734 g_string_free (context->partial_chunk, TRUE);
1735 context->partial_chunk = NULL;
1738 if (context->document_empty)
1740 set_error (context, error, G_MARKUP_ERROR_EMPTY,
1741 _("Document was empty or contained only whitespace"));
1745 context->parsing = TRUE;
1747 switch (context->state)
1753 case STATE_AFTER_OPEN_ANGLE:
1754 set_error (context, error, G_MARKUP_ERROR_PARSE,
1755 _("Document ended unexpectedly just after an open angle bracket '<'"));
1758 case STATE_AFTER_CLOSE_ANGLE:
1759 if (context->tag_stack != NULL)
1761 /* Error message the same as for INSIDE_TEXT */
1762 set_error (context, error, G_MARKUP_ERROR_PARSE,
1763 _("Document ended unexpectedly with elements still open - "
1764 "'%s' was the last element opened"),
1765 current_element (context));
1769 case STATE_AFTER_ELISION_SLASH:
1770 set_error (context, error, G_MARKUP_ERROR_PARSE,
1771 _("Document ended unexpectedly, expected to see a close angle "
1772 "bracket ending the tag <%s/>"), current_element (context));
1775 case STATE_INSIDE_OPEN_TAG_NAME:
1776 set_error (context, error, G_MARKUP_ERROR_PARSE,
1777 _("Document ended unexpectedly inside an element name"));
1780 case STATE_INSIDE_ATTRIBUTE_NAME:
1781 set_error (context, error, G_MARKUP_ERROR_PARSE,
1782 _("Document ended unexpectedly inside an attribute name"));
1785 case STATE_BETWEEN_ATTRIBUTES:
1786 set_error (context, error, G_MARKUP_ERROR_PARSE,
1787 _("Document ended unexpectedly inside an element-opening "
1791 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1792 set_error (context, error, G_MARKUP_ERROR_PARSE,
1793 _("Document ended unexpectedly after the equals sign "
1794 "following an attribute name; no attribute value"));
1797 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1798 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1799 set_error (context, error, G_MARKUP_ERROR_PARSE,
1800 _("Document ended unexpectedly while inside an attribute "
1804 case STATE_INSIDE_TEXT:
1805 g_assert (context->tag_stack != NULL);
1806 set_error (context, error, G_MARKUP_ERROR_PARSE,
1807 _("Document ended unexpectedly with elements still open - "
1808 "'%s' was the last element opened"),
1809 current_element (context));
1812 case STATE_AFTER_CLOSE_TAG_SLASH:
1813 case STATE_INSIDE_CLOSE_TAG_NAME:
1814 set_error (context, error, G_MARKUP_ERROR_PARSE,
1815 _("Document ended unexpectedly inside the close tag for "
1816 "element '%s'"), current_element (context));
1819 case STATE_INSIDE_PASSTHROUGH:
1820 set_error (context, error, G_MARKUP_ERROR_PARSE,
1821 _("Document ended unexpectedly inside a comment or "
1822 "processing instruction"));
1827 g_assert_not_reached ();
1831 context->parsing = FALSE;
1833 return context->state != STATE_ERROR;
1837 * g_markup_parse_context_get_element:
1838 * @context: a #GMarkupParseContext
1839 * @returns: the name of the currently open element, or %NULL
1841 * Retrieves the name of the currently open element.
1845 G_CONST_RETURN gchar *
1846 g_markup_parse_context_get_element (GMarkupParseContext *context)
1848 g_return_val_if_fail (context != NULL, NULL);
1850 if (context->tag_stack == NULL)
1853 return current_element (context);
1857 * g_markup_parse_context_get_position:
1858 * @context: a #GMarkupParseContext
1859 * @line_number: return location for a line number, or %NULL
1860 * @char_number: return location for a char-on-line number, or %NULL
1862 * Retrieves the current line number and the number of the character on
1863 * that line. Intended for use in error messages; there are no strict
1864 * semantics for what constitutes the "current" line number other than
1865 * "the best number we could come up with for error messages."
1869 g_markup_parse_context_get_position (GMarkupParseContext *context,
1873 g_return_if_fail (context != NULL);
1876 *line_number = context->line_number;
1879 *char_number = context->char_number;
1883 append_escaped_text (GString *str,
1891 end = text + length;
1896 next = g_utf8_next_char (p);
1901 g_string_append (str, "&");
1905 g_string_append (str, "<");
1909 g_string_append (str, ">");
1913 g_string_append (str, "'");
1917 g_string_append (str, """);
1921 g_string_append_len (str, p, next - p);
1930 * g_markup_escape_text:
1931 * @text: some valid UTF-8 text
1932 * @length: length of @text in bytes, or -1 if the text is nul-terminated
1934 * Escapes text so that the markup parser will parse it verbatim.
1935 * Less than, greater than, ampersand, etc. are replaced with the
1936 * corresponding entities. This function would typically be used
1937 * when writing out a file to be parsed with the markup parser.
1939 * Note that this function doesn't protect whitespace and line endings
1940 * from being processed according to the XML rules for normalization
1941 * of line endings and attribute values.
1943 * Return value: a newly allocated string with the escaped text
1946 g_markup_escape_text (const gchar *text,
1951 g_return_val_if_fail (text != NULL, NULL);
1954 length = strlen (text);
1956 /* prealloc at least as long as original text */
1957 str = g_string_sized_new (length);
1958 append_escaped_text (str, text, length);
1960 return g_string_free (str, FALSE);
/* Returns the position just past a positional-argument specifier
 * ("digits$") beginning at @cp, or @cp itself when there is none.
 */
static const char *
skip_positional_argument (const char *cp)
{
  const char *np = cp;

  while (*np >= '0' && *np <= '9')
    np++;

  return (np != cp && *np == '$') ? np + 1 : cp;
}

/*
 * find_conversion:
 * @format: a printf-style format string
 * @after: location to store a pointer to the character after
 *   the returned conversion. On a %NULL return, returns the
 *   pointer to the trailing NUL in the string
 *
 * Find the next conversion in a printf-style format string.
 * Partially based on code from printf-parser.c,
 * Copyright (C) 1999-2000, 2002-2003 Free Software Foundation, Inc.
 *
 * A conversion that is cut off by the end of the string (for
 * instance "%-" or "%5.") is not reported as a conversion; it is
 * treated like trailing literal text, so @after never points beyond
 * the terminating NUL.
 *
 * Return value: pointer to the next conversion in @format,
 *  or %NULL, if none.
 */
static const char *
find_conversion (const char  *format,
                 const char **after)
{
  const char *start = format;
  const char *cp;

  while (*start != '\0' && *start != '%')
    start++;

  if (*start == '\0')
    {
      *after = start;
      return NULL;
    }

  cp = start + 1;

  if (*cp == '\0')
    {
      *after = cp;
      return NULL;
    }

  /* Test for positional argument. */
  cp = skip_positional_argument (cp);

  /* Skip the flags. */
  while (*cp == '\'' ||
         *cp == '-' ||
         *cp == '+' ||
         *cp == ' ' ||
         *cp == '#' ||
         *cp == '0')
    cp++;

  /* Skip the field width. */
  if (*cp == '*')
    {
      cp = skip_positional_argument (cp + 1);
    }
  else
    {
      for (; *cp >= '0' && *cp <= '9'; cp++)
        ;
    }

  /* Skip the precision. */
  if (*cp == '.')
    {
      cp++;
      if (*cp == '*')
        {
          /* Step over the '*' itself before looking for "digits$";
           * otherwise "%.*s" would end at the '*' and the real
           * conversion character would be taken for literal text.
           */
          cp = skip_positional_argument (cp + 1);
        }
      else
        {
          for (; *cp >= '0' && *cp <= '9'; cp++)
            ;
        }
    }

  /* Skip argument type/size specifiers. */
  while (*cp == 'h' ||
         *cp == 'L' ||
         *cp == 'l' ||
         *cp == 'j' ||
         *cp == 'z' ||
         *cp == 'Z' ||
         *cp == 't')
    cp++;

  /* The string ended before the conversion character: there is no
   * complete conversion, and stepping further would leave the string.
   */
  if (*cp == '\0')
    {
      *after = cp;
      return NULL;
    }

  /* Skip the conversion character. */
  cp++;

  *after = cp;
  return start;
}
2090 * g_markup_vprintf_escaped:
2091 * @format: printf() style format string
2092 * @args: variable argument list, similar to vprintf()
2094 * Formats the data in @args according to @format, escaping
2095 * all string and character arguments in the fashion
2096 * of g_markup_escape_text(). See g_markup_printf_escaped().
2098 * Return value: newly allocated result from formatting
2099 * operation. Free with g_free().
2104 g_markup_vprintf_escaped (const char *format,
2109 GString *result = NULL;
2110 gchar *output1 = NULL;
2111 gchar *output2 = NULL;
2112 const char *p, *op1, *op2;
2115 /* The technique here, is that we make two format strings that
2116 * have the identical conversions in the identical order to the
2117 * original strings, but differ in the text in-between. We
2118 * then use the normal g_strdup_vprintf() to format the arguments
2119 * with the two new format strings. By comparing the results,
2120 * we can figure out what segments of the output come from
2121 * the the original format string, and what from the arguments,
2122 * and thus know what portions of the string to escape.
2124 * For instance, for:
2126 * g_markup_printf_escaped ("%s ate %d apples", "Susan & Fred", 5);
2128 * We form the two format strings "%sX%dX" and %sY%sY". The results
2129 * of formatting with those two strings are
2131 * "%sX%dX" => "Susan & FredX5X"
2132 * "%sY%dY" => "Susan & FredY5Y"
2134 * To find the span of the first argument, we find the first position
2135 * where the two arguments differ, which tells us that the first
2136 * argument formatted to "Susan & Fred". We then escape that
2137 * to "Susan & Fred" and join up with the intermediate portions
2138 * of the format string and the second argument to get
2139 * "Susan & Fred ate 5 apples".
2142 /* Create the two modified format strings
2144 format1 = g_string_new (NULL);
2145 format2 = g_string_new (NULL);
2150 const char *conv = find_conversion (p, &after);
2154 g_string_append_len (format1, conv, after - conv);
2155 g_string_append_c (format1, 'X');
2156 g_string_append_len (format2, conv, after - conv);
2157 g_string_append_c (format2, 'Y');
2162 /* Use them to format the arguments
2164 G_VA_COPY (args2, args);
2166 output1 = g_strdup_vprintf (format1->str, args);
2171 output2 = g_strdup_vprintf (format2->str, args2);
2176 result = g_string_new (NULL);
2178 /* Iterate through the original format string again,
2179 * copying the non-conversion portions and the escaped
2180 * converted arguments to the output string.
2188 const char *output_start;
2189 const char *conv = find_conversion (p, &after);
2192 if (!conv) /* The end, after points to the trailing \0 */
2194 g_string_append_len (result, p, after - p);
2198 g_string_append_len (result, p, conv - p);
2200 while (*op1 == *op2)
2206 escaped = g_markup_escape_text (output_start, op1 - output_start);
2207 g_string_append (result, escaped);
2216 g_string_free (format1, TRUE);
2217 g_string_free (format2, TRUE);
2222 return g_string_free (result, FALSE);
2228 * g_markup_printf_escaped:
2229 * @format: printf() style format string
2230 * @Varargs: the arguments to insert in the format string
2232 * Formats arguments according to @format, escaping
2233 * all string and character arguments in the fashion
2234 * of g_markup_escape_text(). This is useful when you
2235 * want to insert literal strings into XML-style markup
2236 * output, without having to worry that the strings
2237 * might themselves contain markup.
2239 * <informalexample><programlisting>
2240 * const char *store = "Fortnum & Mason";
2241 * const char *item = "Tea";
2244 * output = g_markup_printf_escaped ("<purchase>"
2245 * "<store>%s</store>"
2246 * "<item>%s</item>"
2247 * "</purchase>",
2249 * </programlisting></informalexample>
2251 * Return value: newly allocated result from formatting
2252 * operation. Free with g_free().
2257 g_markup_printf_escaped (const char *format, ...)
2262 va_start (args, format);
2263 result = g_markup_vprintf_escaped (format, args);
2269 #define __G_MARKUP_C__
2270 #include "galiasdef.c"