1 /* gmarkup.c - Simple XML-like parser
3 * Copyright 2000, 2003 Red Hat, Inc.
5 * GLib is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU Lesser General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * GLib is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with GLib; see the file COPYING.LIB. If not,
17 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
/* g_markup_error_quark:
 * Returns the quark that g_quark_from_static_string() yields for the
 * string "g-markup-error-quark".
 * NOTE(review): the return-type line and the braces are absent from this
 * listing (source line numbers jump 34 -> 36).
 */
34 g_markup_error_quark (void)
36 return g_quark_from_static_string ("g-markup-error-quark");
/* Parser states stored in GMarkupParseContext.state and switched on by
 * g_markup_parse_context_parse().
 * NOTE(review): this listing shows only part of the enumeration.
 * STATE_START, STATE_INSIDE_TEXT and STATE_ERROR are used elsewhere in
 * this file but their declarations (and the enum header/footer) are not
 * visible here.
 */
42 STATE_AFTER_OPEN_ANGLE,
43 STATE_AFTER_CLOSE_ANGLE,
44 STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */
45 STATE_INSIDE_OPEN_TAG_NAME,
46 STATE_INSIDE_ATTRIBUTE_NAME,
47 STATE_AFTER_ATTRIBUTE_NAME,
48 STATE_BETWEEN_ATTRIBUTES,
49 STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
50 STATE_INSIDE_ATTRIBUTE_VALUE_SQ,
51 STATE_INSIDE_ATTRIBUTE_VALUE_DQ,
53 STATE_AFTER_CLOSE_TAG_SLASH,
54 STATE_INSIDE_CLOSE_TAG_NAME,
55 STATE_AFTER_CLOSE_TAG_NAME,
56 STATE_INSIDE_PASSTHROUGH,
/* Per-parse state shared by every function in this file.
 * Fields visible in this listing:
 *   parser / flags / dnotify      - values handed to g_markup_parse_context_new()
 *   partial_chunk                 - text accumulated across calls for the
 *                                   token currently being read
 *   state                         - current GMarkupParseState
 *   current_text / _len / _end    - the chunk passed to the running parse call
 *   leftover_char_portion         - trailing bytes of an incomplete UTF-8
 *                                   character, kept until the next chunk
 *   document_empty                - one-bit flag, cleared once the first '<'
 *                                   of a document has been seen
 * NOTE(review): other members used elsewhere in the file (user_data,
 * line_number, char_number, tag_stack, attr_names, attr_values, cur_attr,
 * alloc_attrs, start, iter, parsing, balance) are declared on lines that
 * are missing from this listing.
 */
60 struct _GMarkupParseContext
62 const GMarkupParser *parser;
64 GMarkupParseFlags flags;
70 GDestroyNotify dnotify;
72 /* A piece of character data or an element that
73 * hasn't "ended" yet so we haven't yet called
74 * the callback for it.
76 GString *partial_chunk;
78 GMarkupParseState state;
85 const gchar *current_text;
86 gssize current_text_len;
87 const gchar *current_text_end;
89 GString *leftover_char_portion;
91 /* used to save the start of the last interesting thingy */
96 guint document_empty : 1;
/* Implementation notes for g_markup_parse_context_new():
 *  - a NULL @parser is rejected through g_return_val_if_fail(), yielding NULL;
 *  - the context is allocated with g_new() and every field is then
 *    assigned explicitly: position counters start at line 1 / char 1,
 *    the state is STATE_START, cur_attr is -1 (no attribute yet),
 *    current_text_len is -1, document_empty is TRUE, parsing is FALSE,
 *    balance is 0 and all pointer members are NULL;
 *  - the parser table is stored by pointer, not copied.
 * NOTE(review): the user_data parameter line and the final return
 * statement are not visible in this listing.
 */
102 * g_markup_parse_context_new:
103 * @parser: a #GMarkupParser
104 * @flags: one or more #GMarkupParseFlags
105 * @user_data: user data to pass to #GMarkupParser functions
106 * @user_data_dnotify: user data destroy notifier called when the parse context is freed
108 * Creates a new parse context. A parse context is used to parse
109 * marked-up documents. You can feed any number of documents into
110 * a context, as long as no errors occur; once an error occurs,
111 * the parse context can't continue to parse text (you have to free it
112 * and create a new parse context).
114 * Return value: a new #GMarkupParseContext
116 GMarkupParseContext *
117 g_markup_parse_context_new (const GMarkupParser *parser,
118 GMarkupParseFlags flags,
120 GDestroyNotify user_data_dnotify)
122 GMarkupParseContext *context;
124 g_return_val_if_fail (parser != NULL, NULL);
126 context = g_new (GMarkupParseContext, 1);
128 context->parser = parser;
129 context->flags = flags;
130 context->user_data = user_data;
131 context->dnotify = user_data_dnotify;
133 context->line_number = 1;
134 context->char_number = 1;
136 context->partial_chunk = NULL;
138 context->state = STATE_START;
139 context->tag_stack = NULL;
140 context->attr_names = NULL;
141 context->attr_values = NULL;
142 context->cur_attr = -1;
143 context->alloc_attrs = 0;
145 context->current_text = NULL;
146 context->current_text_len = -1;
147 context->current_text_end = NULL;
148 context->leftover_char_portion = NULL;
150 context->start = NULL;
151 context->iter = NULL;
153 context->document_empty = TRUE;
154 context->parsing = FALSE;
156 context->balance = 0;
/* Implementation notes for g_markup_parse_context_free():
 *  - returns early (g_return_if_fail) when @context is NULL or when a
 *    parse call is in progress (context->parsing set);
 *  - the user-data destroy notifier, if any, runs first;
 *  - the attribute name/value vectors are released with g_strfreev();
 *  - each element name on tag_stack is g_free()d, then the list itself;
 *  - partial_chunk and leftover_char_portion are freed together with
 *    their character data when present.
 * NOTE(review): no g_free (context) is visible in this listing; it is
 * presumably on one of the dropped lines - confirm against the full file.
 */
162 * g_markup_parse_context_free:
163 * @context: a #GMarkupParseContext
165 * Frees a #GMarkupParseContext. Can't be called from inside
166 * one of the #GMarkupParser functions.
170 g_markup_parse_context_free (GMarkupParseContext *context)
172 g_return_if_fail (context != NULL);
173 g_return_if_fail (!context->parsing);
175 if (context->dnotify)
176 (* context->dnotify) (context->user_data);
178 g_strfreev (context->attr_names);
179 g_strfreev (context->attr_values);
181 g_slist_foreach (context->tag_stack, (GFunc)g_free, NULL);
182 g_slist_free (context->tag_stack);
184 if (context->partial_chunk)
185 g_string_free (context->partial_chunk, TRUE);
187 if (context->leftover_char_portion)
188 g_string_free (context->leftover_char_portion, TRUE);
/* mark_error:
 * Puts @context into STATE_ERROR and, when the parser table supplies an
 * error callback, invokes it with (context, error, user_data).
 * NOTE(review): the second parameter line is missing from this listing;
 * the body refers to it as `error`.
 */
194 mark_error (GMarkupParseContext *context,
197 context->state = STATE_ERROR;
199 if (context->parser->error)
200 (*context->parser->error) (context, error, context->user_data);
/* set_error:
 * Builds a parse error and reports it.
 * The printf-style @format plus varargs are rendered with
 * g_strdup_vprintf(); the result is wrapped as
 * "Error on line %d char %d: %s" using the context's current
 * line_number and char_number. The new GError is passed to mark_error()
 * (which moves the context to STATE_ERROR and fires the error callback)
 * and then handed to the caller through g_propagate_error().
 * The forward declaration exists to attach G_GNUC_PRINTF (4, 5), i.e.
 * the format string is argument 4 and the varargs start at argument 5.
 * NOTE(review): several parameter lines, the va_end call and the release
 * of the temporary string are not visible in this listing.
 */
203 static void set_error (GMarkupParseContext *context,
207 ...) G_GNUC_PRINTF (4, 5);
210 set_error (GMarkupParseContext *context,
220 va_start (args, format);
221 s = g_strdup_vprintf (format, args);
224 tmp_error = g_error_new (G_MARKUP_ERROR,
226 _("Error on line %d char %d: %s"),
227 context->line_number,
228 context->char_number,
233 mark_error (context, tmp_error);
235 g_propagate_error (error, tmp_error);
/* IS_COMMON_NAME_END_CHAR(c) is true when c is one of '=', '/', '>' or
 * ' '. The argument is expanded once per comparison, so it must be free
 * of side effects.
 */
239 /* To make these faster, we first use the ascii-only tests, then check
240 * for the usual non-alnum name-end chars, and only then call the
241 * expensive unicode stuff. Nobody uses non-ascii in XML tag/attribute
242 * names, so this is a reasonable hack that virtually always avoids
245 #define IS_COMMON_NAME_END_CHAR(c) \
246 ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')
/* is_name_start_char:
 * Tests whether the character at @p may begin an element, attribute or
 * entity name. ASCII letters are accepted by the cheap g_ascii_isalpha()
 * test; bytes matched by IS_COMMON_NAME_END_CHAR are rejected before the
 * UTF-8 character is decoded and checked with g_unichar_isalpha().
 * NOTE(review): two lines of the condition and the return statements are
 * missing from this listing, so further accepted characters may exist.
 */
249 is_name_start_char (const gchar *p)
251 if (g_ascii_isalpha (*p) ||
252 (!IS_COMMON_NAME_END_CHAR (*p) &&
255 g_unichar_isalpha (g_utf8_get_char (p)))))
/* is_name_char:
 * Tests whether the character at @p may appear inside a name. ASCII
 * letters and digits pass the g_ascii_isalnum() fast path; bytes matched
 * by IS_COMMON_NAME_END_CHAR are rejected before the UTF-8 character is
 * decoded and checked with g_unichar_isalpha().
 * NOTE(review): four lines of the condition and the return statements
 * are missing from this listing, so further accepted characters may exist.
 */
262 is_name_char (const gchar *p)
264 if (g_ascii_isalnum (*p) ||
265 (!IS_COMMON_NAME_END_CHAR (*p) &&
270 g_unichar_isalpha (g_utf8_get_char (p)))))
/* char_str:
 * Encodes the code point @c as UTF-8 into the caller-supplied buffer
 * using g_unichar_to_utf8(). Callers in this file use the result as a
 * string argument for error messages.
 * NOTE(review): the buffer parameter line, any terminator handling and
 * the return statement are not visible in this listing.
 */
278 char_str (gunichar c,
282 g_unichar_to_utf8 (c, buf);
/* utf8_str:
 * Decodes the first character of @utf8 with g_utf8_get_char() and passes
 * it to char_str() together with the caller-supplied buffer; used to
 * quote a single offending character in error messages.
 * NOTE(review): the buffer parameter line and the return statement are
 * not visible in this listing.
 */
287 utf8_str (const gchar *utf8,
290 char_str (g_utf8_get_char (utf8), buf);
/* set_unescape_error:
 * Error reporter used while unescaping text that has already been
 * scanned by the main parser. context->line_number is past the point of
 * failure by then, so the function walks from @remaining_text to
 * @remaining_text_end counting newlines and subtracts that count from
 * the context line number. The message is formatted as
 * "Error on line %d: %s", sent through mark_error() and propagated to
 * the caller's GError.
 * NOTE(review): the error/code/format parameter lines, the newline test
 * inside the loop and the cleanup of the temporary string are on lines
 * missing from this listing.
 */
295 set_unescape_error (GMarkupParseContext *context,
297 const gchar *remaining_text,
298 const gchar *remaining_text_end,
306 gint remaining_newlines;
309 remaining_newlines = 0;
311 while (p != remaining_text_end)
314 ++remaining_newlines;
318 va_start (args, format);
319 s = g_strdup_vprintf (format, args);
322 tmp_error = g_error_new (G_MARKUP_ERROR,
324 _("Error on line %d: %s"),
325 context->line_number - remaining_newlines,
330 mark_error (context, tmp_error);
332 g_propagate_error (error, tmp_error);
/* States of the entity-unescaping sub-machine driven by unescape_text().
 * NOTE(review): USTATE_INSIDE_TEXT is used elsewhere in this file but its
 * declaration, like the enum header and footer, is not visible in this
 * listing.
 */
338 USTATE_AFTER_AMPERSAND,
339 USTATE_INSIDE_ENTITY_NAME,
340 USTATE_AFTER_CHARREF_HASH
/* Working state for one unescape_text() run:
 *   context      - owning parse context (used for error reporting and to
 *                  tell attribute values from character data)
 *   text_end     - one past the last byte of the text being unescaped
 *   entity_start - first byte of the entity name or character reference
 *                  currently being read, or NULL
 * NOTE(review): the members `state`, `str` and `text`, which other
 * functions in this file access, are declared on lines missing from this
 * listing.
 */
345 GMarkupParseContext *context;
349 const gchar *text_end;
350 const gchar *entity_start;
/* unescape_text_state_inside_text:
 * Handles USTATE_INSIDE_TEXT: copies literal text into ucontext->str up
 * to the next '&' or the end of input.
 * When the owning parse context is inside a single- or double-quoted
 * attribute value, whitespace is normalised: a tab or newline is
 * replaced by one space. In the branch that follows (its test is not
 * visible here) a character is emitted as ' ' in attribute values and as
 * '\n' otherwise, and an immediately following '\n' is skipped - this
 * looks like CR / CRLF line-ending handling; confirm against the full
 * source.
 * On reaching '&' the function steps past it and switches the
 * sub-machine to USTATE_AFTER_AMPERSAND.
 * NOTE(review): parameter lines, local declarations, some branch
 * conditions and the return statement are missing from this listing.
 */
354 unescape_text_state_inside_text (UnescapeContext *ucontext,
359 gboolean normalize_attribute;
361 if (ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
362 ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
363 normalize_attribute = TRUE;
365 normalize_attribute = FALSE;
369 while (p != ucontext->text_end)
375 else if (normalize_attribute && (*p == '\t' || *p == '\n'))
377 g_string_append_len (ucontext->str, start, p - start);
378 g_string_append_c (ucontext->str, ' ');
379 p = g_utf8_next_char (p);
384 g_string_append_len (ucontext->str, start, p - start);
385 g_string_append_c (ucontext->str, normalize_attribute ? ' ' : '\n');
386 p = g_utf8_next_char (p);
387 if (p != ucontext->text_end && *p == '\n')
388 p = g_utf8_next_char (p);
392 p = g_utf8_next_char (p);
396 g_string_append_len (ucontext->str, start, p - start);
398 if (p != ucontext->text_end && *p == '&')
400 p = g_utf8_next_char (p);
401 ucontext->state = USTATE_AFTER_AMPERSAND;
408 unescape_text_state_after_ampersand (UnescapeContext *ucontext,
412 ucontext->entity_start = NULL;
416 p = g_utf8_next_char (p);
418 ucontext->entity_start = p;
419 ucontext->state = USTATE_AFTER_CHARREF_HASH;
421 else if (!is_name_start_char (p))
425 set_unescape_error (ucontext->context, error,
426 p, ucontext->text_end,
427 G_MARKUP_ERROR_PARSE,
428 _("Empty entity '&;' seen; valid "
429 "entities are: & " < > '"));
435 set_unescape_error (ucontext->context, error,
436 p, ucontext->text_end,
437 G_MARKUP_ERROR_PARSE,
438 _("Character '%s' is not valid at "
439 "the start of an entity name; "
440 "the & character begins an entity; "
441 "if this ampersand isn't supposed "
442 "to be an entity, escape it as "
449 ucontext->entity_start = p;
450 ucontext->state = USTATE_INSIDE_ENTITY_NAME;
457 unescape_text_state_inside_entity_name (UnescapeContext *ucontext,
461 while (p != ucontext->text_end)
465 else if (!is_name_char (p))
469 set_unescape_error (ucontext->context, error,
470 p, ucontext->text_end,
471 G_MARKUP_ERROR_PARSE,
472 _("Character '%s' is not valid "
473 "inside an entity name"),
478 p = g_utf8_next_char (p);
481 if (ucontext->context->state != STATE_ERROR)
483 if (p != ucontext->text_end)
485 gint len = p - ucontext->entity_start;
487 /* move to after semicolon */
488 p = g_utf8_next_char (p);
489 ucontext->state = USTATE_INSIDE_TEXT;
491 if (strncmp (ucontext->entity_start, "lt", len) == 0)
492 g_string_append_c (ucontext->str, '<');
493 else if (strncmp (ucontext->entity_start, "gt", len) == 0)
494 g_string_append_c (ucontext->str, '>');
495 else if (strncmp (ucontext->entity_start, "amp", len) == 0)
496 g_string_append_c (ucontext->str, '&');
497 else if (strncmp (ucontext->entity_start, "quot", len) == 0)
498 g_string_append_c (ucontext->str, '"');
499 else if (strncmp (ucontext->entity_start, "apos", len) == 0)
500 g_string_append_c (ucontext->str, '\'');
505 name = g_strndup (ucontext->entity_start, len);
506 set_unescape_error (ucontext->context, error,
507 p, ucontext->text_end,
508 G_MARKUP_ERROR_PARSE,
509 _("Entity name '%s' is not known"),
516 set_unescape_error (ucontext->context, error,
517 /* give line number of the & */
518 ucontext->entity_start, ucontext->text_end,
519 G_MARKUP_ERROR_PARSE,
520 _("Entity did not end with a semicolon; "
521 "most likely you used an ampersand "
522 "character without intending to start "
523 "an entity - escape ampersand as &"));
532 unescape_text_state_after_charref_hash (UnescapeContext *ucontext,
536 gboolean is_hex = FALSE;
539 start = ucontext->entity_start;
544 p = g_utf8_next_char (p);
548 while (p != ucontext->text_end && *p != ';')
549 p = g_utf8_next_char (p);
551 if (p != ucontext->text_end)
553 g_assert (*p == ';');
555 /* digit is between start and p */
564 l = strtoul (start, &end, 16);
566 l = strtoul (start, &end, 10);
568 if (end != p || errno != 0)
570 set_unescape_error (ucontext->context, error,
571 start, ucontext->text_end,
572 G_MARKUP_ERROR_PARSE,
573 _("Failed to parse '%-.*s', which "
574 "should have been a digit "
575 "inside a character reference "
576 "(ê for example) - perhaps "
577 "the digit is too large"),
582 /* characters XML permits */
586 (l >= 0x20 && l <= 0xD7FF) ||
587 (l >= 0xE000 && l <= 0xFFFD) ||
588 (l >= 0x10000 && l <= 0x10FFFF))
591 g_string_append (ucontext->str, char_str (l, buf));
595 set_unescape_error (ucontext->context, error,
596 start, ucontext->text_end,
597 G_MARKUP_ERROR_PARSE,
598 _("Character reference '%-.*s' does not "
599 "encode a permitted character"),
604 /* Move to next state */
605 p = g_utf8_next_char (p); /* past semicolon */
606 ucontext->state = USTATE_INSIDE_TEXT;
610 set_unescape_error (ucontext->context, error,
611 start, ucontext->text_end,
612 G_MARKUP_ERROR_PARSE,
613 _("Empty character reference; "
614 "should include a digit such as "
620 set_unescape_error (ucontext->context, error,
621 start, ucontext->text_end,
622 G_MARKUP_ERROR_PARSE,
623 _("Character reference did not end with a "
625 "most likely you used an ampersand "
626 "character without intending to start "
627 "an entity - escape ampersand as &"));
/* unescape_text:
 * Expands entity and character references in the bytes from the start
 * of the text up to @text_end and hands the result back through
 * *unescaped as a newly allocated GString.
 * The work is a small state machine: an UnescapeContext is initialised
 * in USTATE_INSIDE_TEXT with an output string pre-sized to the input
 * length, and the per-state helpers are called in a loop until the
 * input is exhausted or the parse context enters STATE_ERROR.
 * If the input ends while an entity or character reference is still
 * open, an "Unfinished ..." error is raised.
 * On error the output string is freed; on success it is stored in
 * *unescaped.
 * NOTE(review): some parameter lines, the break statements and the
 * return statements are missing from this listing.
 */
634 unescape_text (GMarkupParseContext *context,
636 const gchar *text_end,
640 UnescapeContext ucontext;
643 ucontext.context = context;
644 ucontext.text = text;
645 ucontext.text_end = text_end;
646 ucontext.entity_start = NULL;
648 ucontext.str = g_string_sized_new (text_end - text);
650 ucontext.state = USTATE_INSIDE_TEXT;
653 while (p != text_end && context->state != STATE_ERROR)
655 g_assert (p < text_end);
657 switch (ucontext.state)
659 case USTATE_INSIDE_TEXT:
661 p = unescape_text_state_inside_text (&ucontext,
667 case USTATE_AFTER_AMPERSAND:
669 p = unescape_text_state_after_ampersand (&ucontext,
676 case USTATE_INSIDE_ENTITY_NAME:
678 p = unescape_text_state_inside_entity_name (&ucontext,
684 case USTATE_AFTER_CHARREF_HASH:
686 p = unescape_text_state_after_charref_hash (&ucontext,
693 g_assert_not_reached ();
698 if (context->state != STATE_ERROR)
700 switch (ucontext.state)
702 case USTATE_INSIDE_TEXT:
704 case USTATE_AFTER_AMPERSAND:
705 case USTATE_INSIDE_ENTITY_NAME:
706 set_unescape_error (context, error,
708 G_MARKUP_ERROR_PARSE,
709 _("Unfinished entity reference"));
711 case USTATE_AFTER_CHARREF_HASH:
712 set_unescape_error (context, error,
714 G_MARKUP_ERROR_PARSE,
715 _("Unfinished character reference"));
720 if (context->state == STATE_ERROR)
722 g_string_free (ucontext.str, TRUE);
728 *unescaped = ucontext.str;
/* advance_char:
 * Moves context->iter forward by one UTF-8 character and keeps the
 * position counters in step: char_number is incremented, and when the
 * character now under the iterator is '\n', line_number is incremented
 * and char_number reset to 1. The end-of-chunk test comes first so that
 * *iter is never read at current_text_end.
 * NOTE(review): the return statements are missing from this listing.
 * Callers use the result as a do/while condition, so it presumably
 * yields FALSE once the end of the chunk is reached - confirm.
 */
733 static inline gboolean
734 advance_char (GMarkupParseContext *context)
736 context->iter = g_utf8_next_char (context->iter);
737 context->char_number += 1;
739 if (context->iter == context->current_text_end)
743 else if (*context->iter == '\n')
745 context->line_number += 1;
746 context->char_number = 1;
/* Returns TRUE when c is one of the four whitespace characters tested
 * here: space, tab, newline or carriage return.
 * NOTE(review): the line carrying the function name and parameter is
 * missing from this listing; the name xml_isspace is taken from the
 * call in skip_spaces().
 */
752 static inline gboolean
755 return c == ' ' || c == '\t' || c == '\n' || c == '\r';
/* skip_spaces:
 * Advances context->iter past whitespace (as defined by xml_isspace())
 * using advance_char(), so line and character counters stay correct.
 * The loop ends at the first non-space character or when advance_char()
 * reports that no more input is available.
 * NOTE(review): the `do` line and the statement taken when a non-space
 * is found are missing from this listing.
 */
759 skip_spaces (GMarkupParseContext *context)
763 if (!xml_isspace (*context->iter))
766 while (advance_char (context));
/* advance_to_name_end:
 * Advances context->iter while the character under it satisfies
 * is_name_char(); stops at the first character that does not, or when
 * advance_char() reports that no more input is available.
 * NOTE(review): the `do` line and the statement taken when a non-name
 * character is found are missing from this listing.
 */
770 advance_to_name_end (GMarkupParseContext *context)
774 if (!is_name_char (context->iter))
777 while (advance_char (context));
/* add_to_partial:
 * Appends the bytes in [text_start, text_end) to context->partial_chunk,
 * creating the GString first (pre-sized to the span length) when none
 * exists. An empty span still guarantees that partial_chunk is non-NULL
 * afterwards, which callers rely on before calling g_string_free() on it
 * or reading ->str.
 */
781 add_to_partial (GMarkupParseContext *context,
782 const gchar *text_start,
783 const gchar *text_end)
785 if (context->partial_chunk == NULL)
786 context->partial_chunk = g_string_sized_new (text_end - text_start);
788 if (text_start != text_end)
789 g_string_append_len (context->partial_chunk, text_start,
790 text_end - text_start);
792 /* Invariant here that partial_chunk exists */
/* truncate_partial:
 * Empties context->partial_chunk (length reset to 0 via
 * g_string_truncate()) while keeping the GString for reuse. Does
 * nothing when no partial chunk exists.
 */
796 truncate_partial (GMarkupParseContext *context)
798 if (context->partial_chunk != NULL)
800 context->partial_chunk = g_string_truncate (context->partial_chunk, 0);
/* current_element:
 * Returns the data pointer at the head of context->tag_stack, i.e. the
 * name of the innermost open element. tag_stack is dereferenced without
 * a NULL check, so callers must only use this while an element is open.
 */
805 current_element (GMarkupParseContext *context)
807 return context->tag_stack->data;
/* current_attribute:
 * Returns the name stored at index cur_attr of context->attr_names,
 * i.e. the attribute most recently added with add_attribute(). Asserts
 * that at least one attribute has been recorded (cur_attr >= 0).
 */
811 current_attribute (GMarkupParseContext *context)
813 g_assert (context->cur_attr >= 0);
814 return context->attr_names[context->cur_attr];
/* find_current_text_end:
 * Decides where parsing of the current chunk must stop so that the
 * parser never sees a truncated UTF-8 character.
 * It locates the start of the last character in the chunk and computes
 * where that character would end. If the character is complete, the
 * whole chunk is used. Otherwise the trailing bytes are copied into
 * context->leftover_char_portion, current_text_len is shortened by the
 * same amount and current_text_end is set to the start of the
 * incomplete character.
 * Requires current_text_len > 0 and a chunk that begins on a character
 * boundary (both asserted).
 * NOTE(review): local declarations and the test that separates the two
 * outcomes are missing from this listing.
 */
818 find_current_text_end (GMarkupParseContext *context)
820 /* This function must be safe (non-segfaulting) on invalid UTF8.
821 * It assumes the string starts with a character start
823 const gchar *end = context->current_text + context->current_text_len;
827 g_assert (context->current_text_len > 0);
829 p = g_utf8_find_prev_char (context->current_text, end);
831 g_assert (p != NULL); /* since current_text was a char start */
833 /* p is now the start of the last character or character portion. */
835 next = g_utf8_next_char (p); /* this only touches *p, nothing beyond */
839 /* whole character */
840 context->current_text_end = end;
845 context->leftover_char_portion = g_string_new_len (p, end - p);
846 context->current_text_len -= (end - p);
847 context->current_text_end = p;
/* add_attribute:
 * Records @name as a new attribute of the element being parsed. The
 * context stores the pointer itself (no copy is made here).
 * The parallel attr_names / attr_values arrays grow in steps of 5 when
 * fewer than two free slots would remain, so there is always room for
 * the new entry plus a NULL terminator. The new entry's value slot is
 * NULL until the attribute value has been parsed, and both arrays are
 * NULL-terminated after it.
 * NOTE(review): the statement that advances context->cur_attr before the
 * stores is not visible in this listing (source line numbers jump
 * 859 -> 862); confirm against the full file.
 */
853 add_attribute (GMarkupParseContext *context, char *name)
855 if (context->cur_attr + 2 >= context->alloc_attrs)
857 context->alloc_attrs += 5; /* silly magic number */
858 context->attr_names = g_realloc (context->attr_names, sizeof(char*)*context->alloc_attrs);
859 context->attr_values = g_realloc (context->attr_values, sizeof(char*)*context->alloc_attrs);
862 context->attr_names[context->cur_attr] = name;
863 context->attr_values[context->cur_attr] = NULL;
864 context->attr_names[context->cur_attr+1] = NULL;
865 context->attr_values[context->cur_attr+1] = NULL;
/* Implementation notes for g_markup_parse_context_parse():
 *
 * Entry checks: @context and @text must be non-NULL, the context must
 * not already be in STATE_ERROR and no other parse call may be running
 * on it (context->parsing). A strlen() of @text is taken when the caller
 * did not supply a length (the guarding condition is not visible in this
 * listing). context->parsing is set for the duration of the call and
 * cleared before returning; the result is
 * context->state != STATE_ERROR.
 *
 * Split UTF-8 characters: when a previous call ended in the middle of a
 * character, the missing bytes are taken from the front of @text,
 * appended to leftover_char_portion, and the completed character is
 * parsed through a recursive call (context->parsing is cleared around
 * that call so the re-entrancy guard passes). If @text still does not
 * complete the character it is appended whole; more than 7 accumulated
 * bytes is reported as G_MARKUP_ERROR_BAD_UTF8.
 *
 * Validation: the chunk must start on a character boundary;
 * find_current_text_end() trims a trailing partial character; the
 * remainder is checked with g_utf8_validate(). On failure the newlines
 * before the invalid byte are counted so the reported line is right.
 *
 * Main loop: runs while iter != current_text_end and switches on
 * context->state. Each case consumes input with advance_char() /
 * skip_spaces() / advance_to_name_end(), stores unfinished tokens in
 * partial_chunk via add_to_partial() when the chunk ends mid-token, and
 * moves to the next state once the token is complete. The parser
 * callbacks are invoked only when non-NULL:
 *   start_element - when the start tag's '>' or '/' has been consumed
 *                   (STATE_BETWEEN_ATTRIBUTES); the attribute arrays are
 *                   freed right after the callback returns
 *   end_element   - for "/>" (STATE_AFTER_ELISION_SLASH) and for a
 *                   matching close tag (STATE_AFTER_CLOSE_TAG_NAME)
 *   text          - for unescaped character data, and for CDATA
 *                   sections when G_MARKUP_TREAT_CDATA_AS_TEXT is set;
 *                   there the "<![CDATA[" prefix is skipped (offset 9)
 *                   and 12 bytes are dropped from the length
 *   passthrough   - for comments, processing instructions, DOCTYPE and
 *                   CDATA sections otherwise, angle brackets included
 * A GError set by a callback is routed through mark_error() and
 * g_propagate_error().
 *
 * Passthrough termination: context->balance tracks nested '<' / '>';
 * the chunk ends on a '>' that closes a "<?...?", "<!--...--",
 * "<![CDATA[...]]" or balanced "<!DOCTYPE" construct.
 *
 * NOTE(review): this listing has lost many lines (braces, declarations,
 * several set_error() call heads, break statements). In addition the two
 * initialisers written "= ∅" at source lines 1306-1307 look like an
 * HTML-entity-decoded form of the address of the local variable
 * `empty` declared just above them; restore them from the upstream
 * file.
 */
869 * g_markup_parse_context_parse:
870 * @context: a #GMarkupParseContext
871 * @text: chunk of text to parse
872 * @text_len: length of @text in bytes
873 * @error: return location for a #GError
875 * Feed some data to the #GMarkupParseContext. The data need not
876 * be valid UTF-8; an error will be signaled if it's invalid.
877 * The data need not be an entire document; you can feed a document
878 * into the parser incrementally, via multiple calls to this function.
879 * Typically, as you receive data from a network connection or file,
880 * you feed each received chunk of data into this function, aborting
881 * the process if an error occurs. Once an error is reported, no further
882 * data may be fed to the #GMarkupParseContext; all errors are fatal.
884 * Return value: %FALSE if an error occurred, %TRUE on success
887 g_markup_parse_context_parse (GMarkupParseContext *context,
892 const gchar *first_invalid;
894 g_return_val_if_fail (context != NULL, FALSE);
895 g_return_val_if_fail (text != NULL, FALSE);
896 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
897 g_return_val_if_fail (!context->parsing, FALSE);
900 text_len = strlen (text);
905 context->parsing = TRUE;
907 if (context->leftover_char_portion)
909 const gchar *first_char;
911 if ((*text & 0xc0) != 0x80)
914 first_char = g_utf8_find_next_char (text, text + text_len);
918 /* leftover_char_portion was completed. Parse it. */
919 GString *portion = context->leftover_char_portion;
921 g_string_append_len (context->leftover_char_portion,
922 text, first_char - text);
924 /* hacks to allow recursion */
925 context->parsing = FALSE;
926 context->leftover_char_portion = NULL;
928 if (!g_markup_parse_context_parse (context,
929 portion->str, portion->len,
932 g_assert (context->state == STATE_ERROR);
935 g_string_free (portion, TRUE);
936 context->parsing = TRUE;
938 /* Skip the fraction of char that was in this text */
939 text_len -= (first_char - text);
944 /* another little chunk of the leftover char; geez
945 * someone is inefficient.
947 g_string_append_len (context->leftover_char_portion,
950 if (context->leftover_char_portion->len > 7)
952 /* The leftover char portion is too big to be
957 G_MARKUP_ERROR_BAD_UTF8,
958 _("Invalid UTF-8 encoded text"));
965 context->current_text = text;
966 context->current_text_len = text_len;
967 context->iter = context->current_text;
968 context->start = context->iter;
970 /* Nothing left after finishing the leftover char, or nothing
971 * passed in to begin with.
973 if (context->current_text_len == 0)
976 /* find_current_text_end () assumes the string starts at
977 * a character start, so we need to validate at least
978 * that much. It doesn't assume any following bytes
981 if ((*context->current_text & 0xc0) == 0x80) /* not a char start */
985 G_MARKUP_ERROR_BAD_UTF8,
986 _("Invalid UTF-8 encoded text"));
990 /* Initialize context->current_text_end, possibly adjusting
991 * current_text_len, and add any leftover char portion
993 find_current_text_end (context);
995 /* Validate UTF8 (must be done after we find the end, since
996 * we could have a trailing incomplete char)
998 if (!g_utf8_validate (context->current_text,
999 context->current_text_len,
1004 p = context->current_text;
1005 while (p != context->current_text_end)
1012 context->line_number += newlines;
1016 G_MARKUP_ERROR_BAD_UTF8,
1017 _("Invalid UTF-8 encoded text"));
1021 while (context->iter != context->current_text_end)
1023 switch (context->state)
1026 /* Possible next state: AFTER_OPEN_ANGLE */
1028 g_assert (context->tag_stack == NULL);
1030 /* whitespace is ignored outside of any elements */
1031 skip_spaces (context);
1033 if (context->iter != context->current_text_end)
1035 if (*context->iter == '<')
1037 /* Move after the open angle */
1038 advance_char (context);
1040 context->state = STATE_AFTER_OPEN_ANGLE;
1042 /* this could start a passthrough */
1043 context->start = context->iter;
1045 /* document is now non-empty */
1046 context->document_empty = FALSE;
1052 G_MARKUP_ERROR_PARSE,
1053 _("Document must begin with an element (e.g. <book>)"));
1058 case STATE_AFTER_OPEN_ANGLE:
1059 /* Possible next states: INSIDE_OPEN_TAG_NAME,
1060 * AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH
1062 if (*context->iter == '?' ||
1063 *context->iter == '!')
1065 /* include < in the passthrough */
1066 const gchar *openangle = "<";
1067 add_to_partial (context, openangle, openangle + 1);
1068 context->start = context->iter;
1069 context->balance = 1;
1070 context->state = STATE_INSIDE_PASSTHROUGH;
1072 else if (*context->iter == '/')
1075 advance_char (context);
1077 context->state = STATE_AFTER_CLOSE_TAG_SLASH;
1079 else if (is_name_start_char (context->iter))
1081 context->state = STATE_INSIDE_OPEN_TAG_NAME;
1083 /* start of tag name */
1084 context->start = context->iter;
1092 G_MARKUP_ERROR_PARSE,
1093 _("'%s' is not a valid character following "
1094 "a '<' character; it may not begin an "
1096 utf8_str (context->iter, buf));
1100 /* The AFTER_CLOSE_ANGLE state is actually sort of
1101 * broken, because it doesn't correspond to a range
1102 * of characters in the input stream as the others do,
1103 * and thus makes things harder to conceptualize
1105 case STATE_AFTER_CLOSE_ANGLE:
1106 /* Possible next states: INSIDE_TEXT, STATE_START */
1107 if (context->tag_stack == NULL)
1109 context->start = NULL;
1110 context->state = STATE_START;
1114 context->start = context->iter;
1115 context->state = STATE_INSIDE_TEXT;
1119 case STATE_AFTER_ELISION_SLASH:
1120 /* Possible next state: AFTER_CLOSE_ANGLE */
1123 /* We need to pop the tag stack and call the end_element
1124 * function, since this is the close tag
1126 GError *tmp_error = NULL;
1128 g_assert (context->tag_stack != NULL);
1131 if (context->parser->end_element)
1132 (* context->parser->end_element) (context,
1133 context->tag_stack->data,
1139 mark_error (context, tmp_error);
1140 g_propagate_error (error, tmp_error);
1144 if (*context->iter == '>')
1146 /* move after the close angle */
1147 advance_char (context);
1148 context->state = STATE_AFTER_CLOSE_ANGLE;
1156 G_MARKUP_ERROR_PARSE,
1157 _("Odd character '%s', expected a '>' character "
1158 "to end the start tag of element '%s'"),
1159 utf8_str (context->iter, buf),
1160 current_element (context));
1164 g_free (context->tag_stack->data);
1165 context->tag_stack = g_slist_delete_link (context->tag_stack,
1166 context->tag_stack);
1170 case STATE_INSIDE_OPEN_TAG_NAME:
1171 /* Possible next states: BETWEEN_ATTRIBUTES */
1173 /* if there's a partial chunk then it's the first part of the
1174 * tag name. If there's a context->start then it's the start
1175 * of the tag name in current_text, the partial chunk goes
1176 * before that start though.
1178 advance_to_name_end (context);
1180 if (context->iter == context->current_text_end)
1182 /* The name hasn't necessarily ended. Merge with
1183 * partial chunk, leave state unchanged.
1185 add_to_partial (context, context->start, context->iter);
1189 /* The name has ended. Combine it with the partial chunk
1190 * if any; push it on the stack; enter next state.
1192 add_to_partial (context, context->start, context->iter);
1193 context->tag_stack =
1194 g_slist_prepend (context->tag_stack,
1195 g_string_free (context->partial_chunk,
1198 context->partial_chunk = NULL;
1200 context->state = STATE_BETWEEN_ATTRIBUTES;
1201 context->start = NULL;
1205 case STATE_INSIDE_ATTRIBUTE_NAME:
1206 /* Possible next states: AFTER_ATTRIBUTE_NAME */
1208 advance_to_name_end (context);
1209 add_to_partial (context, context->start, context->iter);
1211 /* read the full name, if we enter the equals sign state
1212 * then add the attribute to the list (without the value),
1213 * otherwise store a partial chunk to be prepended later.
1215 if (context->iter != context->current_text_end)
1216 context->state = STATE_AFTER_ATTRIBUTE_NAME;
1219 case STATE_AFTER_ATTRIBUTE_NAME:
1220 /* Possible next states: AFTER_ATTRIBUTE_EQUALS_SIGN */
1222 skip_spaces (context);
1224 if (context->iter != context->current_text_end)
1226 /* The name has ended. Combine it with the partial chunk
1227 * if any; push it on the stack; enter next state.
1229 add_attribute (context, g_string_free (context->partial_chunk, FALSE));
1231 context->partial_chunk = NULL;
1232 context->start = NULL;
1234 if (*context->iter == '=')
1236 advance_char (context);
1237 context->state = STATE_AFTER_ATTRIBUTE_EQUALS_SIGN;
1245 G_MARKUP_ERROR_PARSE,
1246 _("Odd character '%s', expected a '=' after "
1247 "attribute name '%s' of element '%s'"),
1248 utf8_str (context->iter, buf),
1249 current_attribute (context),
1250 current_element (context));
1256 case STATE_BETWEEN_ATTRIBUTES:
1257 /* Possible next states: AFTER_CLOSE_ANGLE,
1258 * AFTER_ELISION_SLASH, INSIDE_ATTRIBUTE_NAME
1260 skip_spaces (context);
1262 if (context->iter != context->current_text_end)
1264 if (*context->iter == '/')
1266 advance_char (context);
1267 context->state = STATE_AFTER_ELISION_SLASH;
1269 else if (*context->iter == '>')
1272 advance_char (context);
1273 context->state = STATE_AFTER_CLOSE_ANGLE;
1275 else if (is_name_start_char (context->iter))
1277 context->state = STATE_INSIDE_ATTRIBUTE_NAME;
1278 /* start of attribute name */
1279 context->start = context->iter;
1287 G_MARKUP_ERROR_PARSE,
1288 _("Odd character '%s', expected a '>' or '/' "
1289 "character to end the start tag of "
1290 "element '%s', or optionally an attribute; "
1291 "perhaps you used an invalid character in "
1292 "an attribute name"),
1293 utf8_str (context->iter, buf),
1294 current_element (context));
1297 /* If we're done with attributes, invoke
1298 * the start_element callback
1300 if (context->state == STATE_AFTER_ELISION_SLASH ||
1301 context->state == STATE_AFTER_CLOSE_ANGLE)
1303 const gchar *start_name;
1304 /* Ugly, but the current code expects an empty array instead of NULL */
1305 const gchar *empty = NULL;
1306 const gchar **attr_names = ∅
1307 const gchar **attr_values = ∅
1310 /* Call user callback for element start */
1311 start_name = current_element (context);
1313 if (context->cur_attr >= 0)
1315 attr_names = (const gchar**)context->attr_names;
1316 attr_values = (const gchar**)context->attr_values;
1320 if (context->parser->start_element)
1321 (* context->parser->start_element) (context,
1323 (const gchar **)attr_names,
1324 (const gchar **)attr_values,
1328 /* Go ahead and free the attributes. */
1329 for (; context->cur_attr >= 0; context->cur_attr--)
1331 int pos = context->cur_attr;
1332 g_free (context->attr_names[pos]);
1333 g_free (context->attr_values[pos]);
1334 context->attr_names[pos] = context->attr_values[pos] = NULL;
1336 g_assert (context->cur_attr == -1);
1337 g_assert (context->attr_names == NULL ||
1338 context->attr_names[0] == NULL);
1339 g_assert (context->attr_values == NULL ||
1340 context->attr_values[0] == NULL);
1342 if (tmp_error != NULL)
1344 mark_error (context, tmp_error);
1345 g_propagate_error (error, tmp_error);
1351 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1352 /* Possible next state: INSIDE_ATTRIBUTE_VALUE_[SQ/DQ] */
1354 skip_spaces (context);
1356 if (context->iter != context->current_text_end)
1358 if (*context->iter == '"')
1360 advance_char (context);
1361 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_DQ;
1362 context->start = context->iter;
1364 else if (*context->iter == '\'')
1366 advance_char (context);
1367 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_SQ;
1368 context->start = context->iter;
1376 G_MARKUP_ERROR_PARSE,
1377 _("Odd character '%s', expected an open quote mark "
1378 "after the equals sign when giving value for "
1379 "attribute '%s' of element '%s'"),
1380 utf8_str (context->iter, buf),
1381 current_attribute (context),
1382 current_element (context));
1387 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1388 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1389 /* Possible next states: BETWEEN_ATTRIBUTES */
1393 if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ)
1404 if (*context->iter == delim)
1407 while (advance_char (context));
1409 if (context->iter == context->current_text_end)
1411 /* The value hasn't necessarily ended. Merge with
1412 * partial chunk, leave state unchanged.
1414 add_to_partial (context, context->start, context->iter);
1418 /* The value has ended at the quote mark. Combine it
1419 * with the partial chunk if any; set it for the current
1424 add_to_partial (context, context->start, context->iter);
1426 g_assert (context->cur_attr >= 0);
1428 if (unescape_text (context,
1429 context->partial_chunk->str,
1430 context->partial_chunk->str +
1431 context->partial_chunk->len,
1435 /* success, advance past quote and set state. */
1436 context->attr_values[context->cur_attr] = g_string_free (unescaped, FALSE);
1437 advance_char (context);
1438 context->state = STATE_BETWEEN_ATTRIBUTES;
1439 context->start = NULL;
1442 truncate_partial (context);
1446 case STATE_INSIDE_TEXT:
1447 /* Possible next states: AFTER_OPEN_ANGLE */
1450 if (*context->iter == '<')
1453 while (advance_char (context));
1455 /* The text hasn't necessarily ended. Merge with
1456 * partial chunk, leave state unchanged.
1459 add_to_partial (context, context->start, context->iter);
1461 if (context->iter != context->current_text_end)
1463 GString *unescaped = NULL;
1465 /* The text has ended at the open angle. Call the text
1469 if (unescape_text (context,
1470 context->partial_chunk->str,
1471 context->partial_chunk->str +
1472 context->partial_chunk->len,
1476 GError *tmp_error = NULL;
1478 if (context->parser->text)
1479 (*context->parser->text) (context,
1485 g_string_free (unescaped, TRUE);
1487 if (tmp_error == NULL)
1489 /* advance past open angle and set state. */
1490 advance_char (context);
1491 context->state = STATE_AFTER_OPEN_ANGLE;
1492 /* could begin a passthrough */
1493 context->start = context->iter;
1497 mark_error (context, tmp_error);
1498 g_propagate_error (error, tmp_error);
1502 truncate_partial (context);
1506 case STATE_AFTER_CLOSE_TAG_SLASH:
1507 /* Possible next state: INSIDE_CLOSE_TAG_NAME */
1508 if (is_name_start_char (context->iter))
1510 context->state = STATE_INSIDE_CLOSE_TAG_NAME;
1512 /* start of tag name */
1513 context->start = context->iter;
1521 G_MARKUP_ERROR_PARSE,
1522 _("'%s' is not a valid character following "
1523 "the characters '</'; '%s' may not begin an "
1525 utf8_str (context->iter, buf),
1526 utf8_str (context->iter, buf));
1530 case STATE_INSIDE_CLOSE_TAG_NAME:
1531 /* Possible next state: AFTER_CLOSE_TAG_NAME */
1532 advance_to_name_end (context);
1533 add_to_partial (context, context->start, context->iter);
1535 if (context->iter != context->current_text_end)
1536 context->state = STATE_AFTER_CLOSE_TAG_NAME;
1539 case STATE_AFTER_CLOSE_TAG_NAME:
1540 /* Possible next state: AFTER_CLOSE_TAG_SLASH */
1542 skip_spaces (context);
1544 if (context->iter != context->current_text_end)
1548 /* The name has ended. Combine it with the partial chunk
1549 * if any; check that it matches stack top and pop
1550 * stack; invoke proper callback; enter next state.
1552 close_name = g_string_free (context->partial_chunk, FALSE);
1553 context->partial_chunk = NULL;
1555 if (*context->iter != '>')
1561 G_MARKUP_ERROR_PARSE,
1562 _("'%s' is not a valid character following "
1563 "the close element name '%s'; the allowed "
1564 "character is '>'"),
1565 utf8_str (context->iter, buf),
1568 else if (context->tag_stack == NULL)
1572 G_MARKUP_ERROR_PARSE,
1573 _("Element '%s' was closed, no element "
1574 "is currently open"),
1577 else if (strcmp (close_name, current_element (context)) != 0)
1581 G_MARKUP_ERROR_PARSE,
1582 _("Element '%s' was closed, but the currently "
1583 "open element is '%s'"),
1585 current_element (context));
1590 advance_char (context);
1591 context->state = STATE_AFTER_CLOSE_ANGLE;
1592 context->start = NULL;
1594 /* call the end_element callback */
1596 if (context->parser->end_element)
1597 (* context->parser->end_element) (context,
1603 /* Pop the tag stack */
1604 g_free (context->tag_stack->data);
1605 context->tag_stack = g_slist_delete_link (context->tag_stack,
1606 context->tag_stack);
1610 mark_error (context, tmp_error);
1611 g_propagate_error (error, tmp_error);
1615 g_free (close_name);
1619 case STATE_INSIDE_PASSTHROUGH:
1620 /* Possible next state: AFTER_CLOSE_ANGLE */
1623 if (*context->iter == '<')
1625 if (*context->iter == '>')
1631 add_to_partial (context, context->start, context->iter);
1632 context->start = context->iter;
1634 str = context->partial_chunk->str;
1635 len = context->partial_chunk->len;
1637 if (str[1] == '?' && str[len - 1] == '?')
1639 if (strncmp (str, "<!--", 4) == 0 &&
1640 strcmp (str + len - 2, "--") == 0)
1642 if (strncmp (str, "<![CDATA[", 9) == 0 &&
1643 strcmp (str + len - 2, "]]") == 0)
1645 if (strncmp (str, "<!DOCTYPE", 9) == 0 &&
1646 context->balance == 0)
1650 while (advance_char (context));
1652 if (context->iter == context->current_text_end)
1654 /* The passthrough hasn't necessarily ended. Merge with
1655 * partial chunk, leave state unchanged.
1657 add_to_partial (context, context->start, context->iter);
1661 /* The passthrough has ended at the close angle. Combine
1662 * it with the partial chunk if any. Call the passthrough
1663 * callback. Note that the open/close angles are
1664 * included in the text of the passthrough.
1666 GError *tmp_error = NULL;
1668 advance_char (context); /* advance past close angle */
1669 add_to_partial (context, context->start, context->iter);
1671 if (context->flags & G_MARKUP_TREAT_CDATA_AS_TEXT &&
1672 strncmp (context->partial_chunk->str, "<![CDATA[", 9) == 0)
1674 if (context->parser->text)
1675 (*context->parser->text) (context,
1676 context->partial_chunk->str + 9,
1677 context->partial_chunk->len - 12,
1681 else if (context->parser->passthrough)
1682 (*context->parser->passthrough) (context,
1683 context->partial_chunk->str,
1684 context->partial_chunk->len,
1688 truncate_partial (context);
1690 if (tmp_error == NULL)
1692 context->state = STATE_AFTER_CLOSE_ANGLE;
1693 context->start = context->iter; /* could begin text */
1697 mark_error (context, tmp_error);
1698 g_propagate_error (error, tmp_error);
1708 g_assert_not_reached ();
1714 context->parsing = FALSE;
1716 return context->state != STATE_ERROR;
1720 * g_markup_parse_context_end_parse:
1721 * @context: a #GMarkupParseContext
1722 * @error: return location for a #GError
1724 * Signals to the #GMarkupParseContext that all data has been
1725 * fed into the parse context with g_markup_parse_context_parse().
1726 * This function reports an error if the document isn't complete,
1727 * for example if elements are still open.
1729 * Return value: %TRUE on success, %FALSE if an error was set
1732 g_markup_parse_context_end_parse (GMarkupParseContext *context,
1735 g_return_val_if_fail (context != NULL, FALSE);
1736 g_return_val_if_fail (!context->parsing, FALSE);
1737 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
1739 if (context->partial_chunk != NULL)
1741 g_string_free (context->partial_chunk, TRUE);
1742 context->partial_chunk = NULL;
1745 if (context->document_empty)
1747 set_error (context, error, G_MARKUP_ERROR_EMPTY,
1748 _("Document was empty or contained only whitespace"));
1752 context->parsing = TRUE;
1754 switch (context->state)
1760 case STATE_AFTER_OPEN_ANGLE:
1761 set_error (context, error, G_MARKUP_ERROR_PARSE,
1762 _("Document ended unexpectedly just after an open angle bracket '<'"));
1765 case STATE_AFTER_CLOSE_ANGLE:
1766 if (context->tag_stack != NULL)
1768 /* Error message the same as for INSIDE_TEXT */
1769 set_error (context, error, G_MARKUP_ERROR_PARSE,
1770 _("Document ended unexpectedly with elements still open - "
1771 "'%s' was the last element opened"),
1772 current_element (context));
1776 case STATE_AFTER_ELISION_SLASH:
1777 set_error (context, error, G_MARKUP_ERROR_PARSE,
1778 _("Document ended unexpectedly, expected to see a close angle "
1779 "bracket ending the tag <%s/>"), current_element (context));
1782 case STATE_INSIDE_OPEN_TAG_NAME:
1783 set_error (context, error, G_MARKUP_ERROR_PARSE,
1784 _("Document ended unexpectedly inside an element name"));
1787 case STATE_INSIDE_ATTRIBUTE_NAME:
1788 set_error (context, error, G_MARKUP_ERROR_PARSE,
1789 _("Document ended unexpectedly inside an attribute name"));
1792 case STATE_BETWEEN_ATTRIBUTES:
1793 set_error (context, error, G_MARKUP_ERROR_PARSE,
1794 _("Document ended unexpectedly inside an element-opening "
1798 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1799 set_error (context, error, G_MARKUP_ERROR_PARSE,
1800 _("Document ended unexpectedly after the equals sign "
1801 "following an attribute name; no attribute value"));
1804 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1805 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1806 set_error (context, error, G_MARKUP_ERROR_PARSE,
1807 _("Document ended unexpectedly while inside an attribute "
1811 case STATE_INSIDE_TEXT:
1812 g_assert (context->tag_stack != NULL);
1813 set_error (context, error, G_MARKUP_ERROR_PARSE,
1814 _("Document ended unexpectedly with elements still open - "
1815 "'%s' was the last element opened"),
1816 current_element (context));
1819 case STATE_AFTER_CLOSE_TAG_SLASH:
1820 case STATE_INSIDE_CLOSE_TAG_NAME:
1821 set_error (context, error, G_MARKUP_ERROR_PARSE,
1822 _("Document ended unexpectedly inside the close tag for "
1823 "element '%s'"), current_element (context));
1826 case STATE_INSIDE_PASSTHROUGH:
1827 set_error (context, error, G_MARKUP_ERROR_PARSE,
1828 _("Document ended unexpectedly inside a comment or "
1829 "processing instruction"));
1834 g_assert_not_reached ();
1838 context->parsing = FALSE;
1840 return context->state != STATE_ERROR;
1844 * g_markup_parse_context_get_element:
1845 * @context: a #GMarkupParseContext
1846 * @returns: the name of the currently open element, or %NULL
1848 * Retrieves the name of the currently open element.
1852 G_CONST_RETURN gchar *
1853 g_markup_parse_context_get_element (GMarkupParseContext *context)
1855 g_return_val_if_fail (context != NULL, NULL);
1857 if (context->tag_stack == NULL)
1860 return current_element (context);
1864 * g_markup_parse_context_get_position:
1865 * @context: a #GMarkupParseContext
1866 * @line_number: return location for a line number, or %NULL
1867 * @char_number: return location for a char-on-line number, or %NULL
1869 * Retrieves the current line number and the number of the character on
1870 * that line. Intended for use in error messages; there are no strict
1871 * semantics for what constitutes the "current" line number other than
1872 * "the best number we could come up with for error messages."
1876 g_markup_parse_context_get_position (GMarkupParseContext *context,
1880 g_return_if_fail (context != NULL);
1883 *line_number = context->line_number;
1886 *char_number = context->char_number;
1890 append_escaped_text (GString *str,
1898 end = text + length;
1903 next = g_utf8_next_char (p);
1908 g_string_append (str, "&");
1912 g_string_append (str, "<");
1916 g_string_append (str, ">");
1920 g_string_append (str, "'");
1924 g_string_append (str, """);
1928 g_string_append_len (str, p, next - p);
1937 * g_markup_escape_text:
1938 * @text: some valid UTF-8 text
1939 * @length: length of @text in bytes, or -1 if the text is nul-terminated
1941 * Escapes text so that the markup parser will parse it verbatim.
1942 * Less than, greater than, ampersand, etc. are replaced with the
1943 * corresponding entities. This function would typically be used
1944 * when writing out a file to be parsed with the markup parser.
1946 * Note that this function doesn't protect whitespace and line endings
1947 * from being processed according to the XML rules for normalization
1948 * of line endings and attribute values.
1950 * Return value: a newly allocated string with the escaped text
1953 g_markup_escape_text (const gchar *text,
1958 g_return_val_if_fail (text != NULL, NULL);
1961 length = strlen (text);
1963 /* prealloc at least as long as original text */
1964 str = g_string_sized_new (length);
1965 append_escaped_text (str, text, length);
1967 return g_string_free (str, FALSE);
/* If @cp points at a positional reference of the form "<digits>$",
 * returns the address just past the '$'; otherwise returns @cp
 * unchanged.
 */
static const char *
skip_positional_argument (const char *cp)
{
  const char *np = cp;

  if (*np < '0' || *np > '9')
    return cp;

  while (*np >= '0' && *np <= '9')
    np++;

  return (*np == '$') ? np + 1 : cp;
}

/**
 * find_conversion:
 * @format: a printf-style format string
 * @after: location to store a pointer to the character after
 *   the returned conversion. On a %NULL return, returns the
 *   pointer to the trailing NUL in the string
 *
 * Find the next conversion in a printf-style format string.
 * Partially based on code from printf-parser.c,
 * Copyright (C) 1999-2000, 2002-2003 Free Software Foundation, Inc.
 *
 * Return value: pointer to the next conversion in @format,
 *  or %NULL, if none.
 **/
static const char *
find_conversion (const char  *format,
                 const char **after)
{
  const char *start = format;
  const char *cp;

  while (*start != '\0' && *start != '%')
    start++;

  if (*start == '\0')
    {
      *after = start;
      return NULL;
    }

  cp = start + 1;

  /* A lone '%' at the very end is not a conversion. */
  if (*cp == '\0')
    {
      *after = cp;
      return NULL;
    }

  /* Test for positional argument. */
  cp = skip_positional_argument (cp);

  /* Skip the flags. */
  while (*cp == '\'' ||
         *cp == '-' ||
         *cp == '+' ||
         *cp == ' ' ||
         *cp == '#' ||
         *cp == '0')
    cp++;

  /* Skip the field width. */
  if (*cp == '*')
    {
      cp++;

      /* Test for positional argument. */
      cp = skip_positional_argument (cp);
    }
  else
    {
      for (; *cp >= '0' && *cp <= '9'; cp++)
        ;
    }

  /* Skip the precision. */
  if (*cp == '.')
    {
      cp++;
      if (*cp == '*')
        {
          /* Step over the '*' itself, exactly as for the field width;
           * otherwise "%.*s" would be cut off before the 's'.
           */
          cp++;

          /* Test for positional argument. */
          cp = skip_positional_argument (cp);
        }
      else
        {
          for (; *cp >= '0' && *cp <= '9'; cp++)
            ;
        }
    }

  /* Skip argument type/size specifiers. */
  while (*cp == 'h' ||
         *cp == 'L' ||
         *cp == 'l' ||
         *cp == 'j' ||
         *cp == 'z' ||
         *cp == 'Z' ||
         *cp == 't')
    cp++;

  /* Skip the conversion character, unless the format stopped short
   * of one; @after must never point beyond the terminating NUL.
   */
  if (*cp != '\0')
    cp++;

  *after = cp;
  return start;
}
2097 * g_markup_vprintf_escaped:
2098 * @format: printf() style format string
2099 * @args: variable argument list, similar to vprintf()
2101 * Formats the data in @args according to @format, escaping
2102 * all string and character arguments in the fashion
2103 * of g_markup_escape_text(). See g_markup_printf_escaped().
2105 * Return value: newly allocated result from formatting
2106 * operation. Free with g_free().
2111 g_markup_vprintf_escaped (const char *format,
2116 GString *result = NULL;
2117 gchar *output1 = NULL;
2118 gchar *output2 = NULL;
2119 const char *p, *op1, *op2;
2122 /* The technique here, is that we make two format strings that
2123 * have the identical conversions in the identical order to the
2124 * original strings, but differ in the text in-between. We
2125 * then use the normal g_strdup_vprintf() to format the arguments
2126 * with the two new format strings. By comparing the results,
2127 * we can figure out what segments of the output come from
2128 * the the original format string, and what from the arguments,
2129 * and thus know what portions of the string to escape.
2131 * For instance, for:
2133 * g_markup_printf_escaped ("%s ate %d apples", "Susan & Fred", 5);
2135 * We form the two format strings "%sX%dX" and %sY%sY". The results
2136 * of formatting with those two strings are
2138 * "%sX%dX" => "Susan & FredX5X"
2139 * "%sY%dY" => "Susan & FredY5Y"
2141 * To find the span of the first argument, we find the first position
2142 * where the two arguments differ, which tells us that the first
2143 * argument formatted to "Susan & Fred". We then escape that
2144 * to "Susan & Fred" and join up with the intermediate portions
2145 * of the format string and the second argument to get
2146 * "Susan & Fred ate 5 apples".
2149 /* Create the two modified format strings
2151 format1 = g_string_new (NULL);
2152 format2 = g_string_new (NULL);
2157 const char *conv = find_conversion (p, &after);
2161 g_string_append_len (format1, conv, after - conv);
2162 g_string_append_c (format1, 'X');
2163 g_string_append_len (format2, conv, after - conv);
2164 g_string_append_c (format2, 'Y');
2169 /* Use them to format the arguments
2171 G_VA_COPY (args2, args);
2173 output1 = g_strdup_vprintf (format1->str, args);
2180 output2 = g_strdup_vprintf (format2->str, args2);
2185 result = g_string_new (NULL);
2187 /* Iterate through the original format string again,
2188 * copying the non-conversion portions and the escaped
2189 * converted arguments to the output string.
2197 const char *output_start;
2198 const char *conv = find_conversion (p, &after);
2201 if (!conv) /* The end, after points to the trailing \0 */
2203 g_string_append_len (result, p, after - p);
2207 g_string_append_len (result, p, conv - p);
2209 while (*op1 == *op2)
2215 escaped = g_markup_escape_text (output_start, op1 - output_start);
2216 g_string_append (result, escaped);
2225 g_string_free (format1, TRUE);
2226 g_string_free (format2, TRUE);
2231 return g_string_free (result, FALSE);
/**
 * g_markup_printf_escaped:
 * @format: printf() style format string
 * @Varargs: the arguments to insert in the format string
 *
 * Formats arguments according to @format, escaping
 * all string and character arguments in the fashion
 * of g_markup_escape_text(). This is useful when you
 * want to insert literal strings into XML-style markup
 * output, without having to worry that the strings
 * might themselves contain markup.
 *
 * <informalexample><programlisting>
 * const char *store = "Fortnum &amp; Mason";
 * const char *item = "Tea";
 * char *output;
 * &nbsp;
 * output = g_markup_printf_escaped ("&lt;purchase&gt;"
 *                                   "&lt;store&gt;&percnt;s&lt;/store&gt;"
 *                                   "&lt;item&gt;&percnt;s&lt;/item&gt;"
 *                                   "&lt;/purchase&gt;",
 *                                   store, item);
 * </programlisting></informalexample>
 *
 * Return value: newly allocated result from formatting
 *  operation. Free with g_free().
 **/
char *
g_markup_printf_escaped (const char *format, ...)
{
  char *result;
  va_list args;

  /* Thin varargs front end: all the work is in the va_list variant. */
  va_start (args, format);
  result = g_markup_vprintf_escaped (format, args);
  va_end (args);

  return result;
}
2278 #define __G_MARKUP_C__
2279 #include "galiasdef.c"