1 /* gmarkup.c - Simple XML-like parser
3 * Copyright 2000, 2003 Red Hat, Inc.
5 * GLib is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU Lesser General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * GLib is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with GLib; see the file COPYING.LIB. If not,
17 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
// Returns the quark that g_quark_from_static_string() yields for the
// string "g-markup-error-quark".
// NOTE(review): the leading numbers look like line numbers of the original
// file; the return type and braces are not visible in this listing.
34 g_markup_error_quark (void)
36 return g_quark_from_static_string ("g-markup-error-quark");
// Parser states. g_markup_parse_context_parse() switches on context->state
// using these values.
// NOTE(review): the opening of the enum and some enumerators used elsewhere
// in this file (STATE_START, STATE_INSIDE_TEXT, STATE_ERROR) are not visible
// in this listing.
42 STATE_AFTER_OPEN_ANGLE,
43 STATE_AFTER_CLOSE_ANGLE,
44 STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */
45 STATE_INSIDE_OPEN_TAG_NAME,
46 STATE_INSIDE_ATTRIBUTE_NAME,
47 STATE_AFTER_ATTRIBUTE_NAME,
48 STATE_BETWEEN_ATTRIBUTES,
49 STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
// SQ / DQ: value delimited by a single quote / a double quote
50 STATE_INSIDE_ATTRIBUTE_VALUE_SQ,
51 STATE_INSIDE_ATTRIBUTE_VALUE_DQ,
53 STATE_AFTER_CLOSE_TAG_SLASH,
54 STATE_INSIDE_CLOSE_TAG_NAME,
55 STATE_AFTER_CLOSE_TAG_NAME,
// Entered after "<?" or "<!" (see STATE_AFTER_OPEN_ANGLE handling)
56 STATE_INSIDE_PASSTHROUGH,
// Parser state that persists between calls to g_markup_parse_context_parse().
// NOTE(review): several members used elsewhere in this file (user_data,
// line_number, char_number, tag_stack, attr_names, attr_values, cur_attr,
// alloc_attrs, start, iter, parsing, balance) are not visible in this listing.
60 struct _GMarkupParseContext
// Callback table supplied by the caller; its error, start_element,
// end_element, text and passthrough members are invoked from this file.
62 const GMarkupParser *parser;
64 GMarkupParseFlags flags;
// Called with user_data when the context is freed.
70 GDestroyNotify dnotify;
72 /* A piece of character data or an element that
73 * hasn't "ended" yet so we haven't yet called
74 * the callback for it.
76 GString *partial_chunk;
78 GMarkupParseState state;
// The chunk currently being parsed, its length in bytes, and the end of the
// part that consists of whole characters (set by find_current_text_end()).
85 const gchar *current_text;
86 gssize current_text_len;
87 const gchar *current_text_end;
// Trailing bytes of an incomplete UTF-8 character, kept for the next chunk.
89 GString *leftover_char_portion;
91 /* used to save the start of the last interesting thingy */
// Set to TRUE on creation and cleared when the first '<' is seen.
96 guint document_empty : 1;
// Allocates a context with g_new() and initializes its fields: the position
// counters start at 1, the state is STATE_START, all buffers and pointers
// are NULL, cur_attr is -1 and document_empty is TRUE.
// Returns NULL (via g_return_val_if_fail) when parser is NULL.
// NOTE(review): the opening of the gtk-doc comment, the user_data parameter
// line and the final return statement are not visible in this listing.
102 * g_markup_parse_context_new:
103 * @parser: a #GMarkupParser
104 * @flags: one or more #GMarkupParseFlags
105 * @user_data: user data to pass to #GMarkupParser functions
106 * @user_data_dnotify: user data destroy notifier called when the parse context is freed
108 * Creates a new parse context. A parse context is used to parse
109 * marked-up documents. You can feed any number of documents into
110 * a context, as long as no errors occur; once an error occurs,
111 * the parse context can't continue to parse text (you have to free it
112 * and create a new parse context).
114 * Return value: a new #GMarkupParseContext
116 GMarkupParseContext *
117 g_markup_parse_context_new (const GMarkupParser *parser,
118 GMarkupParseFlags flags,
120 GDestroyNotify user_data_dnotify)
122 GMarkupParseContext *context;
124 g_return_val_if_fail (parser != NULL, NULL);
126 context = g_new (GMarkupParseContext, 1);
128 context->parser = parser;
129 context->flags = flags;
130 context->user_data = user_data;
131 context->dnotify = user_data_dnotify;
// Positions reported in error messages are 1-based.
133 context->line_number = 1;
134 context->char_number = 1;
136 context->partial_chunk = NULL;
138 context->state = STATE_START;
139 context->tag_stack = NULL;
140 context->attr_names = NULL;
141 context->attr_values = NULL;
// -1 means "no attribute collected yet"; the parser tests cur_attr >= 0.
142 context->cur_attr = -1;
143 context->alloc_attrs = 0;
145 context->current_text = NULL;
146 context->current_text_len = -1;
147 context->current_text_end = NULL;
148 context->leftover_char_portion = NULL;
150 context->start = NULL;
151 context->iter = NULL;
153 context->document_empty = TRUE;
154 context->parsing = FALSE;
156 context->balance = 0;
// Tears down a parse context: runs the user-data destroy notifier if one was
// given, then releases the attribute arrays, every tag name on tag_stack and
// the list itself, and the two optional GString buffers.
// Does nothing (g_return_if_fail) for a NULL context or one that is
// currently inside g_markup_parse_context_parse().
// NOTE(review): no release of the context structure itself is visible in
// this listing - confirm against the full file.
162 * g_markup_parse_context_free:
163 * @context: a #GMarkupParseContext
165 * Frees a #GMarkupParseContext. Can't be called from inside
166 * one of the #GMarkupParser functions.
170 g_markup_parse_context_free (GMarkupParseContext *context)
172 g_return_if_fail (context != NULL);
173 g_return_if_fail (!context->parsing);
175 if (context->dnotify)
176 (* context->dnotify) (context->user_data);
178 g_strfreev (context->attr_names);
179 g_strfreev (context->attr_values);
// Free each element name first, then the list nodes.
181 g_slist_foreach (context->tag_stack, (GFunc)g_free, NULL);
182 g_slist_free (context->tag_stack);
184 if (context->partial_chunk)
185 g_string_free (context->partial_chunk, TRUE);
187 if (context->leftover_char_portion)
188 g_string_free (context->leftover_char_portion, TRUE);
// Puts the context into STATE_ERROR and, when the parser table provides an
// error callback, invokes it with the error and the user data.
194 mark_error (GMarkupParseContext *context,
197 context->state = STATE_ERROR;
199 if (context->parser->error)
200 (*context->parser->error) (context, error, context->user_data);
// Builds and reports a parse error located at the current position.
// The prototype tags arguments 4 and 5 as a printf-style format and its
// variable arguments (G_GNUC_PRINTF).
// The definition formats the caller's message with g_strdup_vprintf(), wraps
// it in a G_MARKUP_ERROR GError prefixed with the current line and character
// number, marks the context as failed through mark_error() and hands the
// error to the caller with g_propagate_error().
// NOTE(review): the remaining parameter lines, va_end and the release of the
// formatted string are not visible in this listing.
203 static void set_error (GMarkupParseContext *context,
207 ...) G_GNUC_PRINTF (4, 5);
210 set_error (GMarkupParseContext *context,
220 va_start (args, format);
221 s = g_strdup_vprintf (format, args);
224 tmp_error = g_error_new (G_MARKUP_ERROR,
226 _("Error on line %d char %d: %s"),
227 context->line_number,
228 context->char_number,
233 mark_error (context, tmp_error);
235 g_propagate_error (error, tmp_error);
// IS_COMMON_NAME_END_CHAR(c) is true when c is '=', '/', '>' or ' '.
// is_name_start_char() and is_name_char() use it to reject these characters
// before calling g_unichar_isalpha().
239 /* To make these faster, we first use the ascii-only tests, then check
240 * for the usual non-alnum name-end chars, and only then call the
241 * expensive unicode stuff. Nobody uses non-ascii in XML tag/attribute
242 * names, so this is a reasonable hack that virtually always avoids
245 #define IS_COMMON_NAME_END_CHAR(c) \
246 ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')
// Tests whether the character at p may begin a name. An ASCII letter is
// accepted immediately; otherwise the byte must not be a common name-end
// character and must pass the remaining tests, the last of which is
// g_unichar_isalpha() on the decoded UTF-8 character.
// NOTE(review): part of the condition and the return statements are not
// visible in this listing.
249 is_name_start_char (const gchar *p)
251 if (g_ascii_isalpha (*p) ||
252 (!IS_COMMON_NAME_END_CHAR (*p) &&
255 g_unichar_isalpha (g_utf8_get_char (p)))))
// Tests whether the character at p may appear inside a name. An ASCII letter
// or digit is accepted immediately; otherwise the byte must not be a common
// name-end character and must pass the remaining tests, the last of which is
// g_unichar_isalpha() on the decoded UTF-8 character.
// NOTE(review): part of the condition and the return statements are not
// visible in this listing.
262 is_name_char (const gchar *p)
264 if (g_ascii_isalnum (*p) ||
265 (!IS_COMMON_NAME_END_CHAR (*p) &&
270 g_unichar_isalpha (g_utf8_get_char (p)))))
// Encodes the code point c as UTF-8 into buf with g_unichar_to_utf8().
// Callers in this file use the result as a string argument.
// NOTE(review): the buf parameter line, any buffer initialisation and the
// return statement are not visible in this listing.
278 char_str (gunichar c,
282 g_unichar_to_utf8 (c, buf);
// Decodes the first UTF-8 character of utf8 and passes it to char_str()
// together with buf. Used to print a single offending character in error
// messages.
// NOTE(review): the buf parameter line and the return statement are not
// visible in this listing.
287 utf8_str (const gchar *utf8,
290 char_str (g_utf8_get_char (utf8), buf);
// Reports an error found while unescaping text that was already consumed by
// the parser. It walks to remaining_text_end while tallying
// remaining_newlines and reports the line as
// context->line_number - remaining_newlines.
// The message is formatted with g_strdup_vprintf(), wrapped in a
// G_MARKUP_ERROR GError, recorded through mark_error() and propagated.
// NOTE(review): the test guarding the increment is not visible in this
// listing (presumably it matches newline characters - confirm), nor are the
// error/code/format parameter lines.
295 set_unescape_error (GMarkupParseContext *context,
297 const gchar *remaining_text,
298 const gchar *remaining_text_end,
306 gint remaining_newlines;
309 remaining_newlines = 0;
311 while (p != remaining_text_end)
314 ++remaining_newlines;
318 va_start (args, format);
319 s = g_strdup_vprintf (format, args);
322 tmp_error = g_error_new (G_MARKUP_ERROR,
324 _("Error on line %d: %s"),
325 context->line_number - remaining_newlines,
330 mark_error (context, tmp_error);
332 g_propagate_error (error, tmp_error);
// States of the text unescaper; unescape_text() dispatches on them.
// NOTE(review): USTATE_INSIDE_TEXT, used elsewhere in this file, and the
// enum's opening lines are not visible in this listing.
338 USTATE_AFTER_AMPERSAND,
339 USTATE_INSIDE_ENTITY_NAME,
340 USTATE_AFTER_CHARREF_HASH
// Working state shared by the unescape_text_state_* handlers.
// NOTE(review): the str, state and text members used elsewhere in this file
// are not visible in this listing.
// Owning parse context; its state is checked for STATE_ERROR.
345 GMarkupParseContext *context;
// End of the text being unescaped.
349 const gchar *text_end;
// Start of the entity name or of the character-reference digits.
350 const gchar *entity_start;
// Handles USTATE_INSIDE_TEXT: copies literal text into ucontext->str until an
// '&' or the end of the text is reached.
// When the enclosing parser is inside an attribute value
// (normalize_attribute), a tab or newline is replaced by a single space.
// On '&' the position moves past it and the state becomes
// USTATE_AFTER_AMPERSAND.
// NOTE(review): some branch conditions and the return statement are not
// visible in this listing.
354 unescape_text_state_inside_text (UnescapeContext *ucontext,
359 gboolean normalize_attribute;
361 if (ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
362 ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
363 normalize_attribute = TRUE;
365 normalize_attribute = FALSE;
369 while (p != ucontext->text_end)
375 else if (normalize_attribute && (*p == '\t' || *p == '\n'))
377 g_string_append_len (ucontext->str, start, p - start);
378 g_string_append_c (ucontext->str, ' ');
379 p = g_utf8_next_char (p);
// This branch emits one ' ' or '\n' for the current character and also
// swallows a directly following '\n'. Its condition is not visible here;
// presumably it matches a carriage return - confirm.
384 g_string_append_len (ucontext->str, start, p - start);
385 g_string_append_c (ucontext->str, normalize_attribute ? ' ' : '\n');
386 p = g_utf8_next_char (p);
387 if (p != ucontext->text_end && *p == '\n')
388 p = g_utf8_next_char (p);
392 p = g_utf8_next_char (p);
// Flush the literal run collected so far.
396 g_string_append_len (ucontext->str, start, p - start);
398 if (p != ucontext->text_end && *p == '&')
400 p = g_utf8_next_char (p);
401 ucontext->state = USTATE_AFTER_AMPERSAND;
408 unescape_text_state_after_ampersand (UnescapeContext *ucontext,
412 ucontext->entity_start = NULL;
416 p = g_utf8_next_char (p);
418 ucontext->entity_start = p;
419 ucontext->state = USTATE_AFTER_CHARREF_HASH;
421 else if (!is_name_start_char (p))
425 set_unescape_error (ucontext->context, error,
426 p, ucontext->text_end,
427 G_MARKUP_ERROR_PARSE,
428 _("Empty entity '&;' seen; valid "
429 "entities are: & " < > '"));
435 set_unescape_error (ucontext->context, error,
436 p, ucontext->text_end,
437 G_MARKUP_ERROR_PARSE,
438 _("Character '%s' is not valid at "
439 "the start of an entity name; "
440 "the & character begins an entity; "
441 "if this ampersand isn't supposed "
442 "to be an entity, escape it as "
449 ucontext->entity_start = p;
450 ucontext->state = USTATE_INSIDE_ENTITY_NAME;
457 unescape_text_state_inside_entity_name (UnescapeContext *ucontext,
461 while (p != ucontext->text_end)
465 else if (!is_name_char (p))
469 set_unescape_error (ucontext->context, error,
470 p, ucontext->text_end,
471 G_MARKUP_ERROR_PARSE,
472 _("Character '%s' is not valid "
473 "inside an entity name"),
478 p = g_utf8_next_char (p);
481 if (ucontext->context->state != STATE_ERROR)
483 if (p != ucontext->text_end)
485 gint len = p - ucontext->entity_start;
487 /* move to after semicolon */
488 p = g_utf8_next_char (p);
489 ucontext->state = USTATE_INSIDE_TEXT;
491 if (strncmp (ucontext->entity_start, "lt", len) == 0)
492 g_string_append_c (ucontext->str, '<');
493 else if (strncmp (ucontext->entity_start, "gt", len) == 0)
494 g_string_append_c (ucontext->str, '>');
495 else if (strncmp (ucontext->entity_start, "amp", len) == 0)
496 g_string_append_c (ucontext->str, '&');
497 else if (strncmp (ucontext->entity_start, "quot", len) == 0)
498 g_string_append_c (ucontext->str, '"');
499 else if (strncmp (ucontext->entity_start, "apos", len) == 0)
500 g_string_append_c (ucontext->str, '\'');
505 name = g_strndup (ucontext->entity_start, len);
506 set_unescape_error (ucontext->context, error,
507 p, ucontext->text_end,
508 G_MARKUP_ERROR_PARSE,
509 _("Entity name '%s' is not known"),
516 set_unescape_error (ucontext->context, error,
517 /* give line number of the & */
518 ucontext->entity_start, ucontext->text_end,
519 G_MARKUP_ERROR_PARSE,
520 _("Entity did not end with a semicolon; "
521 "most likely you used an ampersand "
522 "character without intending to start "
523 "an entity - escape ampersand as &"));
532 unescape_text_state_after_charref_hash (UnescapeContext *ucontext,
536 gboolean is_hex = FALSE;
539 start = ucontext->entity_start;
544 p = g_utf8_next_char (p);
548 while (p != ucontext->text_end && *p != ';')
549 p = g_utf8_next_char (p);
551 if (p != ucontext->text_end)
553 g_assert (*p == ';');
555 /* digit is between start and p */
564 l = strtoul (start, &end, 16);
566 l = strtoul (start, &end, 10);
568 if (end != p || errno != 0)
570 set_unescape_error (ucontext->context, error,
571 start, ucontext->text_end,
572 G_MARKUP_ERROR_PARSE,
573 _("Failed to parse '%-.*s', which "
574 "should have been a digit "
575 "inside a character reference "
576 "(ê for example) - perhaps "
577 "the digit is too large"),
582 /* characters XML permits */
586 (l >= 0x20 && l <= 0xD7FF) ||
587 (l >= 0xE000 && l <= 0xFFFD) ||
588 (l >= 0x10000 && l <= 0x10FFFF))
591 g_string_append (ucontext->str, char_str (l, buf));
595 set_unescape_error (ucontext->context, error,
596 start, ucontext->text_end,
597 G_MARKUP_ERROR_PARSE,
598 _("Character reference '%-.*s' does not "
599 "encode a permitted character"),
604 /* Move to next state */
605 p = g_utf8_next_char (p); /* past semicolon */
606 ucontext->state = USTATE_INSIDE_TEXT;
610 set_unescape_error (ucontext->context, error,
611 start, ucontext->text_end,
612 G_MARKUP_ERROR_PARSE,
613 _("Empty character reference; "
614 "should include a digit such as "
620 set_unescape_error (ucontext->context, error,
621 start, ucontext->text_end,
622 G_MARKUP_ERROR_PARSE,
623 _("Character reference did not end with a "
625 "most likely you used an ampersand "
626 "character without intending to start "
627 "an entity - escape ampersand as &"));
// Unescapes the text up to text_end into a newly allocated GString.
// It runs the four unescape_text_state_* handlers until the input is
// consumed or the context enters STATE_ERROR. If the input ends inside an
// entity or character reference, an error is reported.
// On error the partial GString is freed; otherwise it is stored in
// *unescaped.
// NOTE(review): the text/unescaped/error parameter lines and the return
// statements are not visible in this listing.
634 unescape_text (GMarkupParseContext *context,
636 const gchar *text_end,
640 UnescapeContext ucontext;
643 ucontext.context = context;
644 ucontext.text = text;
645 ucontext.text_end = text_end;
646 ucontext.entity_start = NULL;
// The output is pre-sized to the input length.
648 ucontext.str = g_string_sized_new (text_end - text);
650 ucontext.state = USTATE_INSIDE_TEXT;
653 while (p != text_end && context->state != STATE_ERROR)
655 g_assert (p < text_end);
657 switch (ucontext.state)
659 case USTATE_INSIDE_TEXT:
661 p = unescape_text_state_inside_text (&ucontext,
667 case USTATE_AFTER_AMPERSAND:
669 p = unescape_text_state_after_ampersand (&ucontext,
676 case USTATE_INSIDE_ENTITY_NAME:
678 p = unescape_text_state_inside_entity_name (&ucontext,
684 case USTATE_AFTER_CHARREF_HASH:
686 p = unescape_text_state_after_charref_hash (&ucontext,
693 g_assert_not_reached ();
// Input exhausted: any state other than USTATE_INSIDE_TEXT means the text
// stopped in the middle of a reference.
698 if (context->state != STATE_ERROR)
700 switch (ucontext.state)
702 case USTATE_INSIDE_TEXT:
704 case USTATE_AFTER_AMPERSAND:
705 case USTATE_INSIDE_ENTITY_NAME:
706 set_unescape_error (context, error,
708 G_MARKUP_ERROR_PARSE,
709 _("Unfinished entity reference"));
711 case USTATE_AFTER_CHARREF_HASH:
712 set_unescape_error (context, error,
714 G_MARKUP_ERROR_PARSE,
715 _("Unfinished character reference"));
720 if (context->state == STATE_ERROR)
722 g_string_free (ucontext.str, TRUE);
728 *unescaped = ucontext.str;
// Moves context->iter to the next UTF-8 character and increments
// char_number. When the new character is '\n', line_number is incremented
// and char_number is reset to 1.
// Callers use the result as a loop condition ("while (advance_char (...))").
// NOTE(review): the return statements, including the one taken when iter
// reaches current_text_end, are not visible in this listing.
733 static inline gboolean
734 advance_char (GMarkupParseContext *context)
736 context->iter = g_utf8_next_char (context->iter);
737 context->char_number += 1;
739 if (context->iter == context->current_text_end)
743 else if (*context->iter == '\n')
745 context->line_number += 1;
746 context->char_number = 1;
// True when c is a space, tab, newline or carriage return.
// NOTE(review): the line carrying the function name and parameter is not
// visible in this listing; skip_spaces() calls it as xml_isspace().
752 static inline gboolean
755 return c == ' ' || c == '\t' || c == '\n' || c == '\r';
// Advances context->iter with advance_char() while the current character
// satisfies xml_isspace().
// NOTE(review): the loop header and the statement executed on the first
// non-space character are not visible in this listing.
759 skip_spaces (GMarkupParseContext *context)
763 if (!xml_isspace (*context->iter))
766 while (advance_char (context));
// Advances context->iter with advance_char() while the current character
// satisfies is_name_char().
// NOTE(review): the loop header and the statement executed on the first
// non-name character are not visible in this listing.
770 advance_to_name_end (GMarkupParseContext *context)
774 if (!is_name_char (context->iter))
777 while (advance_char (context));
// Appends the bytes from text_start up to text_end to context->partial_chunk.
// The GString is created on first use, pre-sized to the appended length, so
// it exists afterwards even when the range is empty.
781 add_to_partial (GMarkupParseContext *context,
782 const gchar *text_start,
783 const gchar *text_end)
785 if (context->partial_chunk == NULL)
786 context->partial_chunk = g_string_sized_new (text_end - text_start);
788 if (text_start != text_end)
789 g_string_append_len (context->partial_chunk, text_start,
790 text_end - text_start);
792 /* Invariant here that partial_chunk exists */
// Empties context->partial_chunk (length 0) if it exists, keeping the
// GString object itself.
796 truncate_partial (GMarkupParseContext *context)
798 if (context->partial_chunk != NULL)
800 context->partial_chunk = g_string_truncate (context->partial_chunk, 0);
// Returns the data pointer of the head of tag_stack, which holds the name of
// the innermost open element. tag_stack is dereferenced without a NULL
// check, so it must be non-empty when this is called.
805 current_element (GMarkupParseContext *context)
807 return context->tag_stack->data;
// Returns the name stored at index cur_attr of attr_names. It asserts that
// at least one attribute has been collected (cur_attr >= 0).
811 current_attribute (GMarkupParseContext *context)
813 g_assert (context->cur_attr >= 0);
814 return context->attr_names[context->cur_attr];
// Sets context->current_text_end for the chunk in current_text. It locates
// the start of the last character with g_utf8_find_prev_char(). If that
// character is complete, the end is the end of the chunk. Otherwise the
// trailing bytes are saved in leftover_char_portion, current_text_len is
// reduced by their count and the end is placed at the start of the
// incomplete character.
// Requires current_text_len > 0 (asserted).
// NOTE(review): the comparison that chooses between the two outcomes is not
// visible in this listing.
818 find_current_text_end (GMarkupParseContext *context)
820 /* This function must be safe (non-segfaulting) on invalid UTF8.
821 * It assumes the string starts with a character start
823 const gchar *end = context->current_text + context->current_text_len;
827 g_assert (context->current_text_len > 0);
829 p = g_utf8_find_prev_char (context->current_text, end);
831 g_assert (p != NULL); /* since current_text was a char start */
833 /* p is now the start of the last character or character portion. */
835 next = g_utf8_next_char (p); /* this only touches *p, nothing beyond */
839 /* whole character */
840 context->current_text_end = end;
845 context->leftover_char_portion = g_string_new_len (p, end - p);
846 context->current_text_len -= (end - p);
847 context->current_text_end = p;
// Stores name as a new attribute with a NULL value and keeps both arrays
// NULL-terminated by clearing the slot after it. The arrays grow by five
// slots whenever cur_attr + 2 would reach alloc_attrs.
// The stored pointer is later released with g_free() by the parser.
// NOTE(review): cur_attr starts at -1, so it has to be advanced before the
// stores below; that statement is not visible in this listing.
853 add_attribute (GMarkupParseContext *context, char *name)
855 if (context->cur_attr + 2 >= context->alloc_attrs)
857 context->alloc_attrs += 5; /* silly magic number */
858 context->attr_names = g_realloc (context->attr_names, sizeof(char*)*context->alloc_attrs);
859 context->attr_values = g_realloc (context->attr_values, sizeof(char*)*context->alloc_attrs);
862 context->attr_names[context->cur_attr] = name;
863 context->attr_values[context->cur_attr] = NULL;
864 context->attr_names[context->cur_attr+1] = NULL;
865 context->attr_values[context->cur_attr+1] = NULL;
// Feeds one chunk of text to the parser. Steps visible in this listing:
//  - precondition checks: non-NULL context and text, context not already in
//    STATE_ERROR, and no parse already in progress;
//  - a UTF-8 character left incomplete by the previous chunk
//    (leftover_char_portion) is completed or extended, and the completed
//    character is parsed through one recursive call;
//  - a chunk that begins with a UTF-8 continuation byte is rejected;
//  - find_current_text_end() sets aside a trailing incomplete character, the
//    rest is checked with g_utf8_validate(), and on failure the line and
//    character counters are moved to the first invalid byte before
//    G_MARKUP_ERROR_BAD_UTF8 is reported;
//  - the main loop runs the state machine over context->iter up to
//    current_text_end and invokes the parser's start_element, end_element,
//    text and passthrough callbacks.
// NOTE(review): many structural lines (braces, break/goto statements, some
// declarations and arguments) are not visible in this listing.
869 * g_markup_parse_context_parse:
870 * @context: a #GMarkupParseContext
871 * @text: chunk of text to parse
872 * @text_len: length of @text in bytes
873 * @error: return location for a #GError
875 * Feed some data to the #GMarkupParseContext. The data need not
876 * be valid UTF-8; an error will be signaled if it's invalid.
877 * The data need not be an entire document; you can feed a document
878 * into the parser incrementally, via multiple calls to this function.
879 * Typically, as you receive data from a network connection or file,
880 * you feed each received chunk of data into this function, aborting
881 * the process if an error occurs. Once an error is reported, no further
882 * data may be fed to the #GMarkupParseContext; all errors are fatal.
884 * Return value: %FALSE if an error occurred, %TRUE on success
887 g_markup_parse_context_parse (GMarkupParseContext *context,
892 const gchar *first_invalid;
894 g_return_val_if_fail (context != NULL, FALSE);
895 g_return_val_if_fail (text != NULL, FALSE);
896 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
897 g_return_val_if_fail (!context->parsing, FALSE);
900 text_len = strlen (text);
905 context->parsing = TRUE;
907 if (context->leftover_char_portion)
909 const gchar *first_char;
// A byte whose top two bits are not "10" starts a new character.
911 if ((*text & 0xc0) != 0x80)
914 first_char = g_utf8_find_next_char (text, text + text_len);
918 /* leftover_char_portion was completed. Parse it. */
919 GString *portion = context->leftover_char_portion;
921 g_string_append_len (context->leftover_char_portion,
922 text, first_char - text);
924 /* hacks to allow recursion */
925 context->parsing = FALSE;
926 context->leftover_char_portion = NULL;
928 if (!g_markup_parse_context_parse (context,
929 portion->str, portion->len,
932 g_assert (context->state == STATE_ERROR);
935 g_string_free (portion, TRUE);
936 context->parsing = TRUE;
938 /* Skip the fraction of char that was in this text */
939 text_len -= (first_char - text);
944 /* another little chunk of the leftover char; geez
945 * someone is inefficient.
947 g_string_append_len (context->leftover_char_portion,
950 if (context->leftover_char_portion->len > 7)
952 /* The leftover char portion is too big to be
957 G_MARKUP_ERROR_BAD_UTF8,
958 _("Invalid UTF-8 encoded text"));
965 context->current_text = text;
966 context->current_text_len = text_len;
967 context->iter = context->current_text;
968 context->start = context->iter;
970 /* Nothing left after finishing the leftover char, or nothing
971 * passed in to begin with.
973 if (context->current_text_len == 0)
976 /* find_current_text_end () assumes the string starts at
977 * a character start, so we need to validate at least
978 * that much. It doesn't assume any following bytes
981 if ((*context->current_text & 0xc0) == 0x80) /* not a char start */
985 G_MARKUP_ERROR_BAD_UTF8,
986 _("Invalid UTF-8 encoded text"));
990 /* Initialize context->current_text_end, possibly adjusting
991 * current_text_len, and add any leftover char portion
993 find_current_text_end (context);
995 /* Validate UTF8 (must be done after we find the end, since
996 * we could have a trailing incomplete char)
998 if (!g_utf8_validate (context->current_text,
999 context->current_text_len,
1004 q = p = context->current_text;
1005 while (p != first_invalid)
1011 context->char_number = 1;
1016 context->line_number += newlines;
1017 context->char_number += g_utf8_strlen (q, first_invalid - q);
1021 G_MARKUP_ERROR_BAD_UTF8,
1022 _("Invalid UTF-8 encoded text"));
1026 while (context->iter != context->current_text_end)
1028 switch (context->state)
1031 /* Possible next state: AFTER_OPEN_ANGLE */
1033 g_assert (context->tag_stack == NULL);
1035 /* whitespace is ignored outside of any elements */
1036 skip_spaces (context);
1038 if (context->iter != context->current_text_end)
1040 if (*context->iter == '<')
1042 /* Move after the open angle */
1043 advance_char (context);
1045 context->state = STATE_AFTER_OPEN_ANGLE;
1047 /* this could start a passthrough */
1048 context->start = context->iter;
1050 /* document is now non-empty */
1051 context->document_empty = FALSE;
1057 G_MARKUP_ERROR_PARSE,
1058 _("Document must begin with an element (e.g. <book>)"));
1063 case STATE_AFTER_OPEN_ANGLE:
1064 /* Possible next states: INSIDE_OPEN_TAG_NAME,
1065 * AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH
1067 if (*context->iter == '?' ||
1068 *context->iter == '!')
1070 /* include < in the passthrough */
1071 const gchar *openangle = "<";
1072 add_to_partial (context, openangle, openangle + 1);
1073 context->start = context->iter;
1074 context->balance = 1;
1075 context->state = STATE_INSIDE_PASSTHROUGH;
1077 else if (*context->iter == '/')
1080 advance_char (context);
1082 context->state = STATE_AFTER_CLOSE_TAG_SLASH;
1084 else if (is_name_start_char (context->iter))
1086 context->state = STATE_INSIDE_OPEN_TAG_NAME;
1088 /* start of tag name */
1089 context->start = context->iter;
1097 G_MARKUP_ERROR_PARSE,
1098 _("'%s' is not a valid character following "
1099 "a '<' character; it may not begin an "
1101 utf8_str (context->iter, buf));
1105 /* The AFTER_CLOSE_ANGLE state is actually sort of
1106 * broken, because it doesn't correspond to a range
1107 * of characters in the input stream as the others do,
1108 * and thus makes things harder to conceptualize
1110 case STATE_AFTER_CLOSE_ANGLE:
1111 /* Possible next states: INSIDE_TEXT, STATE_START */
1112 if (context->tag_stack == NULL)
1114 context->start = NULL;
1115 context->state = STATE_START;
1119 context->start = context->iter;
1120 context->state = STATE_INSIDE_TEXT;
1124 case STATE_AFTER_ELISION_SLASH:
1125 /* Possible next state: AFTER_CLOSE_ANGLE */
1128 /* We need to pop the tag stack and call the end_element
1129 * function, since this is the close tag
1131 GError *tmp_error = NULL;
1133 g_assert (context->tag_stack != NULL);
1136 if (context->parser->end_element)
1137 (* context->parser->end_element) (context,
1138 context->tag_stack->data,
1144 mark_error (context, tmp_error);
1145 g_propagate_error (error, tmp_error);
1149 if (*context->iter == '>')
1151 /* move after the close angle */
1152 advance_char (context);
1153 context->state = STATE_AFTER_CLOSE_ANGLE;
1161 G_MARKUP_ERROR_PARSE,
1162 _("Odd character '%s', expected a '>' character "
1163 "to end the start tag of element '%s'"),
1164 utf8_str (context->iter, buf),
1165 current_element (context));
1169 g_free (context->tag_stack->data);
1170 context->tag_stack = g_slist_delete_link (context->tag_stack,
1171 context->tag_stack);
1175 case STATE_INSIDE_OPEN_TAG_NAME:
1176 /* Possible next states: BETWEEN_ATTRIBUTES */
1178 /* if there's a partial chunk then it's the first part of the
1179 * tag name. If there's a context->start then it's the start
1180 * of the tag name in current_text, the partial chunk goes
1181 * before that start though.
1183 advance_to_name_end (context);
1185 if (context->iter == context->current_text_end)
1187 /* The name hasn't necessarily ended. Merge with
1188 * partial chunk, leave state unchanged.
1190 add_to_partial (context, context->start, context->iter);
1194 /* The name has ended. Combine it with the partial chunk
1195 * if any; push it on the stack; enter next state.
1197 add_to_partial (context, context->start, context->iter);
1198 context->tag_stack =
1199 g_slist_prepend (context->tag_stack,
1200 g_string_free (context->partial_chunk,
1203 context->partial_chunk = NULL;
1205 context->state = STATE_BETWEEN_ATTRIBUTES;
1206 context->start = NULL;
1210 case STATE_INSIDE_ATTRIBUTE_NAME:
1211 /* Possible next states: AFTER_ATTRIBUTE_NAME */
1213 advance_to_name_end (context);
1214 add_to_partial (context, context->start, context->iter);
1216 /* read the full name, if we enter the equals sign state
1217 * then add the attribute to the list (without the value),
1218 * otherwise store a partial chunk to be prepended later.
1220 if (context->iter != context->current_text_end)
1221 context->state = STATE_AFTER_ATTRIBUTE_NAME;
1224 case STATE_AFTER_ATTRIBUTE_NAME:
1225 /* Possible next states: AFTER_ATTRIBUTE_EQUALS_SIGN */
1227 skip_spaces (context);
1229 if (context->iter != context->current_text_end)
1231 /* The name has ended. Combine it with the partial chunk
1232 * if any; push it on the stack; enter next state.
1234 add_attribute (context, g_string_free (context->partial_chunk, FALSE));
1236 context->partial_chunk = NULL;
1237 context->start = NULL;
1239 if (*context->iter == '=')
1241 advance_char (context);
1242 context->state = STATE_AFTER_ATTRIBUTE_EQUALS_SIGN;
1250 G_MARKUP_ERROR_PARSE,
1251 _("Odd character '%s', expected a '=' after "
1252 "attribute name '%s' of element '%s'"),
1253 utf8_str (context->iter, buf),
1254 current_attribute (context),
1255 current_element (context));
1261 case STATE_BETWEEN_ATTRIBUTES:
1262 /* Possible next states: AFTER_CLOSE_ANGLE,
1263 * AFTER_ELISION_SLASH, INSIDE_ATTRIBUTE_NAME
1265 skip_spaces (context);
1267 if (context->iter != context->current_text_end)
1269 if (*context->iter == '/')
1271 advance_char (context);
1272 context->state = STATE_AFTER_ELISION_SLASH;
1274 else if (*context->iter == '>')
1277 advance_char (context);
1278 context->state = STATE_AFTER_CLOSE_ANGLE;
1280 else if (is_name_start_char (context->iter))
1282 context->state = STATE_INSIDE_ATTRIBUTE_NAME;
1283 /* start of attribute name */
1284 context->start = context->iter;
1292 G_MARKUP_ERROR_PARSE,
1293 _("Odd character '%s', expected a '>' or '/' "
1294 "character to end the start tag of "
1295 "element '%s', or optionally an attribute; "
1296 "perhaps you used an invalid character in "
1297 "an attribute name"),
1298 utf8_str (context->iter, buf),
1299 current_element (context));
1302 /* If we're done with attributes, invoke
1303 * the start_element callback
1305 if (context->state == STATE_AFTER_ELISION_SLASH ||
1306 context->state == STATE_AFTER_CLOSE_ANGLE)
1308 const gchar *start_name;
1309 /* Ugly, but the current code expects an empty array instead of NULL */
1310 const gchar *empty = NULL;
// NOTE(review): the symbol after "=" on the next two lines looks like a
// mis-decoded "&empty;" (the address of the local "empty" above) - confirm
// against the original file; as written the two lines are not valid C.
1311 const gchar **attr_names = ∅
1312 const gchar **attr_values = ∅
1315 /* Call user callback for element start */
1316 start_name = current_element (context);
1318 if (context->cur_attr >= 0)
1320 attr_names = (const gchar**)context->attr_names;
1321 attr_values = (const gchar**)context->attr_values;
1325 if (context->parser->start_element)
1326 (* context->parser->start_element) (context,
1328 (const gchar **)attr_names,
1329 (const gchar **)attr_values,
1333 /* Go ahead and free the attributes. */
1334 for (; context->cur_attr >= 0; context->cur_attr--)
1336 int pos = context->cur_attr;
1337 g_free (context->attr_names[pos]);
1338 g_free (context->attr_values[pos]);
1339 context->attr_names[pos] = context->attr_values[pos] = NULL;
1341 g_assert (context->cur_attr == -1);
1342 g_assert (context->attr_names == NULL ||
1343 context->attr_names[0] == NULL);
1344 g_assert (context->attr_values == NULL ||
1345 context->attr_values[0] == NULL);
1347 if (tmp_error != NULL)
1349 mark_error (context, tmp_error);
1350 g_propagate_error (error, tmp_error);
1356 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1357 /* Possible next state: INSIDE_ATTRIBUTE_VALUE_[SQ/DQ] */
1359 skip_spaces (context);
1361 if (context->iter != context->current_text_end)
1363 if (*context->iter == '"')
1365 advance_char (context);
1366 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_DQ;
1367 context->start = context->iter;
1369 else if (*context->iter == '\'')
1371 advance_char (context);
1372 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_SQ;
1373 context->start = context->iter;
1381 G_MARKUP_ERROR_PARSE,
1382 _("Odd character '%s', expected an open quote mark "
1383 "after the equals sign when giving value for "
1384 "attribute '%s' of element '%s'"),
1385 utf8_str (context->iter, buf),
1386 current_attribute (context),
1387 current_element (context));
1392 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1393 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1394 /* Possible next states: BETWEEN_ATTRIBUTES */
1398 if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ)
1409 if (*context->iter == delim)
1412 while (advance_char (context));
1414 if (context->iter == context->current_text_end)
1416 /* The value hasn't necessarily ended. Merge with
1417 * partial chunk, leave state unchanged.
1419 add_to_partial (context, context->start, context->iter);
1423 /* The value has ended at the quote mark. Combine it
1424 * with the partial chunk if any; set it for the current
1429 add_to_partial (context, context->start, context->iter);
1431 g_assert (context->cur_attr >= 0);
1433 if (unescape_text (context,
1434 context->partial_chunk->str,
1435 context->partial_chunk->str +
1436 context->partial_chunk->len,
1440 /* success, advance past quote and set state. */
1441 context->attr_values[context->cur_attr] = g_string_free (unescaped, FALSE);
1442 advance_char (context);
1443 context->state = STATE_BETWEEN_ATTRIBUTES;
1444 context->start = NULL;
1447 truncate_partial (context);
1451 case STATE_INSIDE_TEXT:
1452 /* Possible next states: AFTER_OPEN_ANGLE */
1455 if (*context->iter == '<')
1458 while (advance_char (context));
1460 /* The text hasn't necessarily ended. Merge with
1461 * partial chunk, leave state unchanged.
1464 add_to_partial (context, context->start, context->iter);
1466 if (context->iter != context->current_text_end)
1468 GString *unescaped = NULL;
1470 /* The text has ended at the open angle. Call the text
1474 if (unescape_text (context,
1475 context->partial_chunk->str,
1476 context->partial_chunk->str +
1477 context->partial_chunk->len,
1481 GError *tmp_error = NULL;
1483 if (context->parser->text)
1484 (*context->parser->text) (context,
1490 g_string_free (unescaped, TRUE);
1492 if (tmp_error == NULL)
1494 /* advance past open angle and set state. */
1495 advance_char (context);
1496 context->state = STATE_AFTER_OPEN_ANGLE;
1497 /* could begin a passthrough */
1498 context->start = context->iter;
1502 mark_error (context, tmp_error);
1503 g_propagate_error (error, tmp_error);
1507 truncate_partial (context);
1511 case STATE_AFTER_CLOSE_TAG_SLASH:
1512 /* Possible next state: INSIDE_CLOSE_TAG_NAME */
1513 if (is_name_start_char (context->iter))
1515 context->state = STATE_INSIDE_CLOSE_TAG_NAME;
1517 /* start of tag name */
1518 context->start = context->iter;
1526 G_MARKUP_ERROR_PARSE,
1527 _("'%s' is not a valid character following "
1528 "the characters '</'; '%s' may not begin an "
1530 utf8_str (context->iter, buf),
1531 utf8_str (context->iter, buf));
1535 case STATE_INSIDE_CLOSE_TAG_NAME:
1536 /* Possible next state: AFTER_CLOSE_TAG_NAME */
1537 advance_to_name_end (context);
1538 add_to_partial (context, context->start, context->iter);
1540 if (context->iter != context->current_text_end)
1541 context->state = STATE_AFTER_CLOSE_TAG_NAME;
1544 case STATE_AFTER_CLOSE_TAG_NAME:
1545 /* Possible next state: AFTER_CLOSE_TAG_SLASH */
1547 skip_spaces (context);
1549 if (context->iter != context->current_text_end)
1553 /* The name has ended. Combine it with the partial chunk
1554 * if any; check that it matches stack top and pop
1555 * stack; invoke proper callback; enter next state.
1557 close_name = g_string_free (context->partial_chunk, FALSE);
1558 context->partial_chunk = NULL;
1560 if (*context->iter != '>')
1566 G_MARKUP_ERROR_PARSE,
1567 _("'%s' is not a valid character following "
1568 "the close element name '%s'; the allowed "
1569 "character is '>'"),
1570 utf8_str (context->iter, buf),
1573 else if (context->tag_stack == NULL)
1577 G_MARKUP_ERROR_PARSE,
1578 _("Element '%s' was closed, no element "
1579 "is currently open"),
1582 else if (strcmp (close_name, current_element (context)) != 0)
1586 G_MARKUP_ERROR_PARSE,
1587 _("Element '%s' was closed, but the currently "
1588 "open element is '%s'"),
1590 current_element (context));
1595 advance_char (context);
1596 context->state = STATE_AFTER_CLOSE_ANGLE;
1597 context->start = NULL;
1599 /* call the end_element callback */
1601 if (context->parser->end_element)
1602 (* context->parser->end_element) (context,
1608 /* Pop the tag stack */
1609 g_free (context->tag_stack->data);
1610 context->tag_stack = g_slist_delete_link (context->tag_stack,
1611 context->tag_stack);
1615 mark_error (context, tmp_error);
1616 g_propagate_error (error, tmp_error);
1620 g_free (close_name);
1624 case STATE_INSIDE_PASSTHROUGH:
1625 /* Possible next state: AFTER_CLOSE_ANGLE */
1628 if (*context->iter == '<')
1630 if (*context->iter == '>')
1636 add_to_partial (context, context->start, context->iter);
1637 context->start = context->iter;
1639 str = context->partial_chunk->str;
1640 len = context->partial_chunk->len;
// The tests below recognise the end of a processing instruction, a comment,
// a CDATA section and a DOCTYPE declaration respectively.
1642 if (str[1] == '?' && str[len - 1] == '?')
1644 if (strncmp (str, "<!--", 4) == 0 &&
1645 strcmp (str + len - 2, "--") == 0)
1647 if (strncmp (str, "<![CDATA[", 9) == 0 &&
1648 strcmp (str + len - 2, "]]") == 0)
1650 if (strncmp (str, "<!DOCTYPE", 9) == 0 &&
1651 context->balance == 0)
1655 while (advance_char (context));
1657 if (context->iter == context->current_text_end)
1659 /* The passthrough hasn't necessarily ended. Merge with
1660 * partial chunk, leave state unchanged.
1662 add_to_partial (context, context->start, context->iter);
1666 /* The passthrough has ended at the close angle. Combine
1667 * it with the partial chunk if any. Call the passthrough
1668 * callback. Note that the open/close angles are
1669 * included in the text of the passthrough.
1671 GError *tmp_error = NULL;
1673 advance_char (context); /* advance past close angle */
1674 add_to_partial (context, context->start, context->iter);
1676 if (context->flags & G_MARKUP_TREAT_CDATA_AS_TEXT &&
1677 strncmp (context->partial_chunk->str, "<![CDATA[", 9) == 0)
// Skips the 9-byte "<![CDATA[" prefix; the length also drops the 3-byte
// "]]>" suffix (9 + 3 = 12).
1679 if (context->parser->text)
1680 (*context->parser->text) (context,
1681 context->partial_chunk->str + 9,
1682 context->partial_chunk->len - 12,
1686 else if (context->parser->passthrough)
1687 (*context->parser->passthrough) (context,
1688 context->partial_chunk->str,
1689 context->partial_chunk->len,
1693 truncate_partial (context);
1695 if (tmp_error == NULL)
1697 context->state = STATE_AFTER_CLOSE_ANGLE;
1698 context->start = context->iter; /* could begin text */
1702 mark_error (context, tmp_error);
1703 g_propagate_error (error, tmp_error);
1713 g_assert_not_reached ();
1719 context->parsing = FALSE;
1721 return context->state != STATE_ERROR;
1725 * g_markup_parse_context_end_parse:
1726 * @context: a #GMarkupParseContext
1727 * @error: return location for a #GError
1729 * Signals to the #GMarkupParseContext that all data has been
1730 * fed into the parse context with g_markup_parse_context_parse().
1731 * This function reports an error if the document isn't complete,
1732 * for example if elements are still open.
1734 * Return value: %TRUE on success, %FALSE if an error was set
1737 g_markup_parse_context_end_parse (GMarkupParseContext *context,
1740 g_return_val_if_fail (context != NULL, FALSE);
1741 g_return_val_if_fail (!context->parsing, FALSE);
1742 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
1744 if (context->partial_chunk != NULL)
1746 g_string_free (context->partial_chunk, TRUE);
1747 context->partial_chunk = NULL;
1750 if (context->document_empty)
1752 set_error (context, error, G_MARKUP_ERROR_EMPTY,
1753 _("Document was empty or contained only whitespace"));
1757 context->parsing = TRUE;
1759 switch (context->state)
1765 case STATE_AFTER_OPEN_ANGLE:
1766 set_error (context, error, G_MARKUP_ERROR_PARSE,
1767 _("Document ended unexpectedly just after an open angle bracket '<'"));
1770 case STATE_AFTER_CLOSE_ANGLE:
1771 if (context->tag_stack != NULL)
1773 /* Error message the same as for INSIDE_TEXT */
1774 set_error (context, error, G_MARKUP_ERROR_PARSE,
1775 _("Document ended unexpectedly with elements still open - "
1776 "'%s' was the last element opened"),
1777 current_element (context));
1781 case STATE_AFTER_ELISION_SLASH:
1782 set_error (context, error, G_MARKUP_ERROR_PARSE,
1783 _("Document ended unexpectedly, expected to see a close angle "
1784 "bracket ending the tag <%s/>"), current_element (context));
1787 case STATE_INSIDE_OPEN_TAG_NAME:
1788 set_error (context, error, G_MARKUP_ERROR_PARSE,
1789 _("Document ended unexpectedly inside an element name"));
1792 case STATE_INSIDE_ATTRIBUTE_NAME:
1793 set_error (context, error, G_MARKUP_ERROR_PARSE,
1794 _("Document ended unexpectedly inside an attribute name"));
1797 case STATE_BETWEEN_ATTRIBUTES:
1798 set_error (context, error, G_MARKUP_ERROR_PARSE,
1799 _("Document ended unexpectedly inside an element-opening "
1803 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1804 set_error (context, error, G_MARKUP_ERROR_PARSE,
1805 _("Document ended unexpectedly after the equals sign "
1806 "following an attribute name; no attribute value"));
1809 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1810 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1811 set_error (context, error, G_MARKUP_ERROR_PARSE,
1812 _("Document ended unexpectedly while inside an attribute "
1816 case STATE_INSIDE_TEXT:
1817 g_assert (context->tag_stack != NULL);
1818 set_error (context, error, G_MARKUP_ERROR_PARSE,
1819 _("Document ended unexpectedly with elements still open - "
1820 "'%s' was the last element opened"),
1821 current_element (context));
1824 case STATE_AFTER_CLOSE_TAG_SLASH:
1825 case STATE_INSIDE_CLOSE_TAG_NAME:
1826 set_error (context, error, G_MARKUP_ERROR_PARSE,
1827 _("Document ended unexpectedly inside the close tag for "
1828 "element '%s'"), current_element (context));
1831 case STATE_INSIDE_PASSTHROUGH:
1832 set_error (context, error, G_MARKUP_ERROR_PARSE,
1833 _("Document ended unexpectedly inside a comment or "
1834 "processing instruction"));
1839 g_assert_not_reached ();
1843 context->parsing = FALSE;
1845 return context->state != STATE_ERROR;
1849 * g_markup_parse_context_get_element:
1850 * @context: a #GMarkupParseContext
1851 * @returns: the name of the currently open element, or %NULL
1853 * Retrieves the name of the currently open element.
1857 G_CONST_RETURN gchar *
1858 g_markup_parse_context_get_element (GMarkupParseContext *context)
1860 g_return_val_if_fail (context != NULL, NULL);
1862 if (context->tag_stack == NULL)
1865 return current_element (context);
1869 * g_markup_parse_context_get_position:
1870 * @context: a #GMarkupParseContext
1871 * @line_number: return location for a line number, or %NULL
1872 * @char_number: return location for a char-on-line number, or %NULL
1874 * Retrieves the current line number and the number of the character on
1875 * that line. Intended for use in error messages; there are no strict
1876 * semantics for what constitutes the "current" line number other than
1877 * "the best number we could come up with for error messages."
1881 g_markup_parse_context_get_position (GMarkupParseContext *context,
1885 g_return_if_fail (context != NULL);
1888 *line_number = context->line_number;
1891 *char_number = context->char_number;
1895 append_escaped_text (GString *str,
1903 end = text + length;
1908 next = g_utf8_next_char (p);
1913 g_string_append (str, "&");
1917 g_string_append (str, "<");
1921 g_string_append (str, ">");
1925 g_string_append (str, "'");
1929 g_string_append (str, """);
1933 g_string_append_len (str, p, next - p);
1942 * g_markup_escape_text:
1943 * @text: some valid UTF-8 text
1944 * @length: length of @text in bytes, or -1 if the text is nul-terminated
1946 * Escapes text so that the markup parser will parse it verbatim.
1947 * Less than, greater than, ampersand, etc. are replaced with the
1948 * corresponding entities. This function would typically be used
1949 * when writing out a file to be parsed with the markup parser.
1951 * Note that this function doesn't protect whitespace and line endings
1952 * from being processed according to the XML rules for normalization
1953 * of line endings and attribute values.
1955 * Return value: a newly allocated string with the escaped text
1958 g_markup_escape_text (const gchar *text,
1963 g_return_val_if_fail (text != NULL, NULL);
1966 length = strlen (text);
1968 /* prealloc at least as long as original text */
1969 str = g_string_sized_new (length);
1970 append_escaped_text (str, text, length);
1972 return g_string_free (str, FALSE);
/* Returns the position just past an "N$" positional-argument marker
 * that starts at @cp, or @cp itself when no such marker is present.
 */
static const char *
skip_positional_argument (const char *cp)
{
  const char *np = cp;

  while (*np >= '0' && *np <= '9')
    np++;

  return (np != cp && *np == '$') ? np + 1 : cp;
}

/**
 * find_conversion:
 * @format: a printf-style format string
 * @after: location to store a pointer to the character after
 *   the returned conversion. On a %NULL return, returns the
 *   pointer to the trailing NUL in the string
 *
 * Find the next conversion in a printf-style format string.
 * Partially based on code from printf-parser.c,
 * Copyright (C) 1999-2000, 2002-2003 Free Software Foundation, Inc.
 *
 * Return value: pointer to the next conversion in @format,
 *  or %NULL, if none.
 **/
static const char *
find_conversion (const char  *format,
                 const char **after)
{
  const char *start = format;
  const char *cp;

  while (*start != '\0' && *start != '%')
    start++;

  if (*start == '\0')
    {
      *after = start;
      return NULL;
    }

  cp = start + 1;

  /* A lone '%' at the very end is not a conversion. */
  if (*cp == '\0')
    {
      *after = cp;
      return NULL;
    }

  /* Test for positional argument. */
  cp = skip_positional_argument (cp);

  /* Skip the flags. */
  while (*cp == '\'' ||
         *cp == '-' ||
         *cp == '+' ||
         *cp == ' ' ||
         *cp == '#' ||
         *cp == '0')
    cp++;

  /* Skip the field width. */
  if (*cp == '*')
    {
      cp++;
      cp = skip_positional_argument (cp);
    }
  else
    {
      while (*cp >= '0' && *cp <= '9')
        cp++;
    }

  /* Skip the precision. */
  if (*cp == '.')
    {
      cp++;
      if (*cp == '*')
        {
          /* Step over the '*' itself before looking for "N$"; without
           * this, "%.*s" is cut short and reported as "%.*".
           */
          cp++;
          cp = skip_positional_argument (cp);
        }
      else
        {
          while (*cp >= '0' && *cp <= '9')
            cp++;
        }
    }

  /* Skip argument type/size specifiers. */
  while (*cp == 'h' ||
         *cp == 'L' ||
         *cp == 'l' ||
         *cp == 'j' ||
         *cp == 'z' ||
         *cp == 'Z' ||
         *cp == 't')
    cp++;

  /* Skip the conversion character, but never step past the terminating
   * NUL of a truncated conversion such as "%5".
   */
  if (*cp != '\0')
    cp++;

  *after = cp;
  return start;
}
2102 * g_markup_vprintf_escaped:
2103 * @format: printf() style format string
2104 * @args: variable argument list, similar to vprintf()
2106 * Formats the data in @args according to @format, escaping
2107 * all string and character arguments in the fashion
2108 * of g_markup_escape_text(). See g_markup_printf_escaped().
2110 * Return value: newly allocated result from formatting
2111 * operation. Free with g_free().
2116 g_markup_vprintf_escaped (const char *format,
2121 GString *result = NULL;
2122 gchar *output1 = NULL;
2123 gchar *output2 = NULL;
2124 const char *p, *op1, *op2;
2127 /* The technique here, is that we make two format strings that
2128 * have the identical conversions in the identical order to the
2129 * original strings, but differ in the text in-between. We
2130 * then use the normal g_strdup_vprintf() to format the arguments
2131 * with the two new format strings. By comparing the results,
2132 * we can figure out what segments of the output come from
2133 * the the original format string, and what from the arguments,
2134 * and thus know what portions of the string to escape.
2136 * For instance, for:
2138 * g_markup_printf_escaped ("%s ate %d apples", "Susan & Fred", 5);
2140 * We form the two format strings "%sX%dX" and %sY%sY". The results
2141 * of formatting with those two strings are
2143 * "%sX%dX" => "Susan & FredX5X"
2144 * "%sY%dY" => "Susan & FredY5Y"
2146 * To find the span of the first argument, we find the first position
2147 * where the two arguments differ, which tells us that the first
2148 * argument formatted to "Susan & Fred". We then escape that
2149 * to "Susan & Fred" and join up with the intermediate portions
2150 * of the format string and the second argument to get
2151 * "Susan & Fred ate 5 apples".
2154 /* Create the two modified format strings
2156 format1 = g_string_new (NULL);
2157 format2 = g_string_new (NULL);
2162 const char *conv = find_conversion (p, &after);
2166 g_string_append_len (format1, conv, after - conv);
2167 g_string_append_c (format1, 'X');
2168 g_string_append_len (format2, conv, after - conv);
2169 g_string_append_c (format2, 'Y');
2174 /* Use them to format the arguments
2176 G_VA_COPY (args2, args);
2178 output1 = g_strdup_vprintf (format1->str, args);
2185 output2 = g_strdup_vprintf (format2->str, args2);
2190 result = g_string_new (NULL);
2192 /* Iterate through the original format string again,
2193 * copying the non-conversion portions and the escaped
2194 * converted arguments to the output string.
2202 const char *output_start;
2203 const char *conv = find_conversion (p, &after);
2206 if (!conv) /* The end, after points to the trailing \0 */
2208 g_string_append_len (result, p, after - p);
2212 g_string_append_len (result, p, conv - p);
2214 while (*op1 == *op2)
2220 escaped = g_markup_escape_text (output_start, op1 - output_start);
2221 g_string_append (result, escaped);
2230 g_string_free (format1, TRUE);
2231 g_string_free (format2, TRUE);
2236 return g_string_free (result, FALSE);
/**
 * g_markup_printf_escaped:
 * @format: printf() style format string
 * @Varargs: the arguments to insert in the format string
 *
 * Formats arguments according to @format, escaping
 * all string and character arguments in the fashion
 * of g_markup_escape_text(). This is useful when you
 * want to insert literal strings into XML-style markup
 * output, without having to worry that the strings
 * might themselves contain markup.
 *
 * <informalexample><programlisting>
 * const char *store = "Fortnum &amp; Mason";
 * const char *item = "Tea";
 * char *output;
 * &nbsp;
 * output = g_markup_printf_escaped ("&lt;purchase&gt;"
 *                                   "&lt;store&gt;&percnt;s&lt;/store&gt;"
 *                                   "&lt;item&gt;&percnt;s&lt;/item&gt;"
 *                                   "&lt;/purchase&gt;",
 *                                   store, item);
 * </programlisting></informalexample>
 *
 * Return value: newly allocated result from formatting
 *  operation. Free with g_free().
 *
 * Since: 2.4
 **/
char *
g_markup_printf_escaped (const char *format, ...)
{
  char *result;
  va_list args;

  va_start (args, format);
  result = g_markup_vprintf_escaped (format, args);
  va_end (args);  /* every va_start needs a matching va_end */

  return result;
}
2283 #define __G_MARKUP_C__
2284 #include "galiasdef.c"