1 /* gmarkup.c - Simple XML-like parser
3 * Copyright 2000, 2003 Red Hat, Inc.
5 * GLib is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU Lesser General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * GLib is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with GLib; see the file COPYING.LIB. If not,
17 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
// Error-domain quark for markup errors. The visible lines keep the quark in
// a function-local static and create it from a static string, so the quark
// is only looked up by name when the assignment runs.
// NOTE(review): the return type, the guard around the assignment and the
// return statement are not visible in this listing -- confirm upstream.
35 g_markup_error_quark (void)
37 static GQuark error_quark = 0;
40 error_quark = g_quark_from_static_string ("g-markup-error-quark");
// Parser states assigned to context->state by the main parse loop. Each
// enumerator names the position in the markup that the loop has reached.
// NOTE(review): the enum header, some enumerators that are used elsewhere in
// this file (STATE_START, STATE_INSIDE_TEXT, STATE_ERROR) and the closing
// typedef name are not visible in this listing.
48 STATE_AFTER_OPEN_ANGLE,
49 STATE_AFTER_CLOSE_ANGLE,
50 STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */
51 STATE_INSIDE_OPEN_TAG_NAME,
52 STATE_INSIDE_ATTRIBUTE_NAME,
53 STATE_AFTER_ATTRIBUTE_NAME,
54 STATE_BETWEEN_ATTRIBUTES,
55 STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
56 STATE_INSIDE_ATTRIBUTE_VALUE_SQ,
57 STATE_INSIDE_ATTRIBUTE_VALUE_DQ,
59 STATE_AFTER_CLOSE_TAG_SLASH,
60 STATE_INSIDE_CLOSE_TAG_NAME,
61 STATE_AFTER_CLOSE_TAG_NAME,
62 STATE_INSIDE_PASSTHROUGH,
// Per-parse state shared by every function in this file.
//   parser                - callback table; its error, start_element,
//                           end_element, text and passthrough members are
//                           each checked for NULL before being called.
//   flags                 - stored at creation; no visible line reads it.
//   dnotify               - called on user_data when the context is freed.
//   partial_chunk         - text accumulated across chunks for a token that
//                           has not ended yet.
//   state                 - current parser state.
//   current_text, current_text_len, current_text_end
//                         - the chunk handed to the current parse call; the
//                           end pointer is set by find_current_text_end.
//   leftover_char_portion - trailing bytes of a UTF-8 character that was cut
//                           off at the end of a chunk.
//   document_empty        - one-bit flag; TRUE at creation and cleared when
//                           the first open angle is consumed.
// NOTE(review): several members used elsewhere in this file (user_data,
// line_number, char_number, tag_stack, attr_names, attr_values, cur_attr,
// alloc_attrs, start, iter, parsing, balance) are declared on lines that are
// not visible in this listing.
66 struct _GMarkupParseContext
68 const GMarkupParser *parser;
70 GMarkupParseFlags flags;
76 GDestroyNotify dnotify;
78 /* A piece of character data or an element that
79 * hasn't "ended" yet so we haven't yet called
80 * the callback for it.
82 GString *partial_chunk;
84 GMarkupParseState state;
91 const gchar *current_text;
92 gssize current_text_len;
93 const gchar *current_text_end;
95 GString *leftover_char_portion;
97 /* used to save the start of the last interesting thingy */
102 guint document_empty : 1;
108 * g_markup_parse_context_new:
109 * @parser: a #GMarkupParser
110 * @flags: one or more #GMarkupParseFlags
111 * @user_data: user data to pass to #GMarkupParser functions
112 * @user_data_dnotify: user data destroy notifier called when the parse context is freed
114 * Creates a new parse context. A parse context is used to parse
115 * marked-up documents. You can feed any number of documents into
116 * a context, as long as no errors occur; once an error occurs,
117 * the parse context can't continue to parse text (you have to free it
118 * and create a new parse context).
120 * Return value: a new #GMarkupParseContext
// Allocates a parse context and puts every field into its start-of-document
// value. A NULL parser is rejected by the g_return_val_if_fail check, which
// yields NULL.
// NOTE(review): the user_data parameter line and the final return are not
// visible in this listing.
122 GMarkupParseContext *
123 g_markup_parse_context_new (const GMarkupParser *parser,
124 GMarkupParseFlags flags,
126 GDestroyNotify user_data_dnotify)
128 GMarkupParseContext *context;
130 g_return_val_if_fail (parser != NULL, NULL);
// g_new does not zero the allocation, hence the explicit assignment of
// every field below.
132 context = g_new (GMarkupParseContext, 1);
134 context->parser = parser;
135 context->flags = flags;
136 context->user_data = user_data;
137 context->dnotify = user_data_dnotify;
// Positions reported in error messages are 1-based.
139 context->line_number = 1;
140 context->char_number = 1;
142 context->partial_chunk = NULL;
144 context->state = STATE_START;
145 context->tag_stack = NULL;
146 context->attr_names = NULL;
147 context->attr_values = NULL;
// -1 means no attribute is being collected; current_attribute asserts
// that cur_attr is non-negative before indexing attr_names.
148 context->cur_attr = -1;
149 context->alloc_attrs = 0;
151 context->current_text = NULL;
152 context->current_text_len = -1;
153 context->current_text_end = NULL;
154 context->leftover_char_portion = NULL;
156 context->start = NULL;
157 context->iter = NULL;
159 context->document_empty = TRUE;
160 context->parsing = FALSE;
162 context->balance = 0;
168 * g_markup_parse_context_free:
169 * @context: a #GMarkupParseContext
171 * Frees a #GMarkupParseContext. Can't be called from inside
172 * one of the #GMarkupParser functions.
// Releases everything the context owns. The call is refused (via
// g_return_if_fail) for a NULL context or while context->parsing is set,
// which is how a free from inside a parser callback is rejected.
// NOTE(review): the final release of the context structure itself is not
// visible in this listing.
176 g_markup_parse_context_free (GMarkupParseContext *context)
178 g_return_if_fail (context != NULL);
179 g_return_if_fail (!context->parsing);
// Give the owner a chance to release user_data.
181 if (context->dnotify)
182 (* context->dnotify) (context->user_data);
// add_attribute keeps both arrays NULL-terminated, which is the layout
// g_strfreev expects.
184 g_strfreev (context->attr_names);
185 g_strfreev (context->attr_values);
// Free each element name, then the list cells that held them.
187 g_slist_foreach (context->tag_stack, (GFunc)g_free, NULL);
188 g_slist_free (context->tag_stack);
// TRUE also frees the character data owned by the GString.
190 if (context->partial_chunk)
191 g_string_free (context->partial_chunk, TRUE);
193 if (context->leftover_char_portion)
194 g_string_free (context->leftover_char_portion, TRUE);
// Puts the context into STATE_ERROR and, when the parser table supplies an
// error callback, reports the error to it together with user_data.
// NOTE(review): the error parameter declaration is not visible here.
200 mark_error (GMarkupParseContext *context,
203 context->state = STATE_ERROR;
205 if (context->parser->error)
206 (*context->parser->error) (context, error, context->user_data);
// Builds a positioned parse error from a printf-style format, flags the
// context as failed through mark_error, and hands the error to the caller
// through g_propagate_error. The message is prefixed with the current line
// and character numbers of the context.
// NOTE(review): the remaining parameters, the va_end call and the release
// of the formatted string s are not visible in this listing.
210 set_error (GMarkupParseContext *context,
220 va_start (args, format);
221 s = g_strdup_vprintf (format, args);
224 tmp_error = g_error_new (G_MARKUP_ERROR,
226 _("Error on line %d char %d: %s"),
227 context->line_number,
228 context->char_number,
// mark_error runs first so the error callback sees the error before
// ownership moves to the caller.
233 mark_error (context, tmp_error);
235 g_propagate_error (error, tmp_error);
239 /* To make these faster, we first use the ascii-only tests, then check
240 * for the usual non-alnum name-end chars, and only then call the
241 * expensive unicode stuff. Nobody uses non-ascii in XML tag/attribute
242 * names, so this is a reasonable hack that virtually always avoids
// True for the four ASCII characters tested below: equals sign, slash,
// closing angle and space. The name-character tests use it to reject these
// before falling back to the Unicode classification calls. The argument is
// evaluated up to four times, so it must be free of side effects.
245 #define IS_COMMON_NAME_END_CHAR(c) \
246 ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')
// Tests whether the character at p may begin an element, attribute or
// entity name. Order of tests in the visible lines: ASCII letter first,
// then rejection of the common name terminators, and only then the Unicode
// alphabetic test on the decoded character.
// NOTE(review): two lines of the condition and the return statements are
// not visible in this listing.
249 is_name_start_char (const gchar *p)
251 if (g_ascii_isalpha (*p) ||
252 (!IS_COMMON_NAME_END_CHAR (*p) &&
255 g_unichar_isalpha (g_utf8_get_char (p)))))
// Tests whether the character at p may continue a name. Same shape as
// is_name_start_char, except that the ASCII fast path accepts digits as
// well as letters (g_ascii_isalnum).
// NOTE(review): several lines of the condition and the return statements
// are not visible in this listing.
262 is_name_char (const gchar *p)
264 if (g_ascii_isalnum (*p) ||
265 (!IS_COMMON_NAME_END_CHAR (*p) &&
270 g_unichar_isalpha (g_utf8_get_char (p)))))
// Encodes the code point c as UTF-8 into the caller-supplied buffer buf.
// Callers in this file pass the result straight to printf-style formatting
// and to g_string_append.
// NOTE(review): the buf parameter line, any clearing or termination of the
// buffer, and the return statement are not visible in this listing.
278 char_str (gunichar c,
282 g_unichar_to_utf8 (c, buf);
// Decodes the first character of utf8 and re-encodes it into buf through
// char_str, so error messages can show a single offending character.
// NOTE(review): the buf parameter line and the return statement are not
// visible in this listing.
287 utf8_str (const gchar *utf8,
290 char_str (g_utf8_get_char (utf8), buf);
// Error reporter for the unescaping pass. That pass runs after the main
// loop has already advanced through the text, so the line reported is
// context->line_number minus the value counted into remaining_newlines
// while walking from remaining_text to remaining_text_end.
// The message carries a line number only, not a character number.
// NOTE(review): the newline test inside the loop, the loop increment and
// the remaining parameters are not visible in this listing.
295 set_unescape_error (GMarkupParseContext *context,
297 const gchar *remaining_text,
298 const gchar *remaining_text_end,
306 gint remaining_newlines;
309 remaining_newlines = 0;
311 while (p != remaining_text_end)
314 ++remaining_newlines;
318 va_start (args, format);
319 s = g_strdup_vprintf (format, args);
322 tmp_error = g_error_new (G_MARKUP_ERROR,
324 _("Error on line %d: %s"),
325 context->line_number - remaining_newlines,
330 mark_error (context, tmp_error);
332 g_propagate_error (error, tmp_error);
// States of the small machine that expands entity and character references
// in unescape_text.
// NOTE(review): the enum header, the USTATE_INSIDE_TEXT enumerator used
// elsewhere in this file, and the closing typedef name are not visible.
338 USTATE_AFTER_AMPERSAND,
339 USTATE_INSIDE_ENTITY_NAME,
340 USTATE_AFTER_CHARREF_HASH
// Working state for one unescape_text call.
//   context      - owning parse context; used for error reporting and to
//                  tell whether attribute whitespace normalisation applies.
//   text_end     - one past the last byte of the text being unescaped.
//   entity_start - first byte of the entity name or character reference
//                  digits currently being scanned, or NULL.
// NOTE(review): the struct header and the state, str and text members used
// elsewhere in this file are not visible in this listing.
345 GMarkupParseContext *context;
349 const gchar *text_end;
350 const gchar *entity_start;
// Copies literal text into ucontext->str up to the next ampersand or the
// end of the input, normalising whitespace on the way.
// Inside an attribute value (either quote style) tab and newline are
// replaced by a single space. The branch at original lines 384-388 appends
// a space in attribute values or a newline otherwise, then skips one
// directly following newline.
// If the scan stopped on an ampersand it is consumed and the machine moves
// to USTATE_AFTER_AMPERSAND.
// NOTE(review): the loop break on ampersand, the condition of the branch
// at original lines 384-388 (presumably a carriage return test), the
// updates of start and the return statement are not visible here.
354 unescape_text_state_inside_text (UnescapeContext *ucontext,
359 gboolean normalize_attribute;
361 if (ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
362 ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
363 normalize_attribute = TRUE;
365 normalize_attribute = FALSE;
369 while (p != ucontext->text_end)
375 else if (normalize_attribute && (*p == '\t' || *p == '\n'))
// Flush the literal run collected so far, then emit the replacement.
377 g_string_append_len (ucontext->str, start, p - start);
378 g_string_append_c (ucontext->str, ' ');
379 p = g_utf8_next_char (p);
384 g_string_append_len (ucontext->str, start, p - start);
385 g_string_append_c (ucontext->str, normalize_attribute ? ' ' : '\n');
386 p = g_utf8_next_char (p);
387 if (p != ucontext->text_end && *p == '\n')
388 p = g_utf8_next_char (p);
392 p = g_utf8_next_char (p);
// Flush whatever literal text remains before the stop position.
396 g_string_append_len (ucontext->str, start, p - start);
398 if (p != ucontext->text_end && *p == '&')
400 p = g_utf8_next_char (p);
401 ucontext->state = USTATE_AFTER_AMPERSAND;
// Decides what kind of reference follows an ampersand.
// Visible outcomes:
//   - character reference: the marker character is skipped, entity_start
//     points at the digits and the state becomes USTATE_AFTER_CHARREF_HASH;
//   - not a name start character: a parse error is raised, with one message
//     for an empty entity and another naming the offending character;
//   - otherwise: entity_start points at p and the state becomes
//     USTATE_INSIDE_ENTITY_NAME.
// NOTE(review): the conditions selecting the first branch and choosing
// between the two error messages are not visible in this listing.
// NOTE(review): the entity examples inside the message strings appear to
// have been entity-decoded by the extraction (bare ampersand, quote and
// angle characters, with unbalanced quoting); compare with the upstream
// file before relying on the literal text.
408 unescape_text_state_after_ampersand (UnescapeContext *ucontext,
412 ucontext->entity_start = NULL;
416 p = g_utf8_next_char (p);
418 ucontext->entity_start = p;
419 ucontext->state = USTATE_AFTER_CHARREF_HASH;
421 else if (!is_name_start_char (p))
425 set_unescape_error (ucontext->context, error,
426 p, ucontext->text_end,
427 G_MARKUP_ERROR_PARSE,
428 _("Empty entity '&;' seen; valid "
429 "entities are: & " < > '"));
435 set_unescape_error (ucontext->context, error,
436 p, ucontext->text_end,
437 G_MARKUP_ERROR_PARSE,
438 _("Character '%s' is not valid at "
439 "the start of an entity name; "
440 "the & character begins an entity; "
441 "if this ampersand isn't supposed "
442 "to be an entity, escape it as "
449 ucontext->entity_start = p;
450 ucontext->state = USTATE_INSIDE_ENTITY_NAME;
// Scans an entity name and expands it.
// The scan loop advances while characters are valid name characters and
// raises a parse error on the first one that is not. When no error was
// raised and the scan stopped before the end of the text, the name is
// compared against lt, gt, amp, quot and apos and the matching character is
// appended to ucontext->str; any other name is a parse error. Reaching the
// end of the text instead is reported as an entity without a terminating
// semicolon.
// buf holds MAX_ENT_LEN + 1 bytes, all zero initially, so a name of up to
// MAX_ENT_LEN bytes stays NUL-terminated for strcmp.
// NOTE(review): the loop exit on the semicolon and the loop that copies
// the name from entity_start into buf are not visible in this listing.
// Confirm upstream that the copy is bounded by MAX_ENT_LEN; an unbounded
// copy of a longer name would overrun buf.
// NOTE(review): the trailing ampersand example in the last message looks
// entity-decoded by the extraction.
457 unescape_text_state_inside_entity_name (UnescapeContext *ucontext,
461 #define MAX_ENT_LEN 5
462 gchar buf[MAX_ENT_LEN+1] = {
463 '\0', '\0', '\0', '\0', '\0', '\0'
467 while (p != ucontext->text_end)
471 else if (!is_name_char (p))
475 set_unescape_error (ucontext->context, error,
476 p, ucontext->text_end,
477 G_MARKUP_ERROR_PARSE,
478 _("Character '%s' is not valid "
479 "inside an entity name"),
484 p = g_utf8_next_char (p);
487 if (ucontext->context->state != STATE_ERROR)
489 if (p != ucontext->text_end)
493 src = ucontext->entity_start;
502 /* move to after semicolon */
503 p = g_utf8_next_char (p);
504 ucontext->state = USTATE_INSIDE_TEXT;
506 if (strcmp (buf, "lt") == 0)
507 g_string_append_c (ucontext->str, '<');
508 else if (strcmp (buf, "gt") == 0)
509 g_string_append_c (ucontext->str, '>');
510 else if (strcmp (buf, "amp") == 0)
511 g_string_append_c (ucontext->str, '&');
512 else if (strcmp (buf, "quot") == 0)
513 g_string_append_c (ucontext->str, '"');
514 else if (strcmp (buf, "apos") == 0)
515 g_string_append_c (ucontext->str, '\'');
518 set_unescape_error (ucontext->context, error,
519 p, ucontext->text_end,
520 G_MARKUP_ERROR_PARSE,
521 _("Entity name '%s' is not known"),
527 set_unescape_error (ucontext->context, error,
528 /* give line number of the & */
529 ucontext->entity_start, ucontext->text_end,
530 G_MARKUP_ERROR_PARSE,
531 _("Entity did not end with a semicolon; "
532 "most likely you used an ampersand "
533 "character without intending to start "
534 "an entity - escape ampersand as &"));
// Parses a numeric character reference and appends the character.
// Visible flow: scan forward to the semicolon; convert the digits with
// strtoul in base 16 or base 10; treat a conversion that stops anywhere
// other than the semicolon, or that sets errno, as a parse error; accept
// only values inside the ranges tested below (0x20-0xD7FF, 0xE000-0xFFFD,
// 0x10000-0x10FFFF, plus whatever the non-visible first line of the test
// allows); append the UTF-8 encoding; step past the semicolon and return
// to USTATE_INSIDE_TEXT.
// Two further errors are raised: an empty reference, and a reference that
// reaches the end of the text without a semicolon.
// NOTE(review): the test that sets is_hex, the reset of errno before
// strtoul, the first line of the permitted-range condition and the branch
// conditions around the last two errors are not visible in this listing.
// NOTE(review): the example inside the first message and the trailing
// ampersand in the last one look entity-decoded by the extraction.
543 unescape_text_state_after_charref_hash (UnescapeContext *ucontext,
547 gboolean is_hex = FALSE;
550 start = ucontext->entity_start;
555 p = g_utf8_next_char (p);
559 while (p != ucontext->text_end && *p != ';')
560 p = g_utf8_next_char (p);
562 if (p != ucontext->text_end)
564 g_assert (*p == ';');
566 /* digit is between start and p */
575 l = strtoul (start, &end, 16);
577 l = strtoul (start, &end, 10);
579 if (end != p || errno != 0)
581 set_unescape_error (ucontext->context, error,
582 start, ucontext->text_end,
583 G_MARKUP_ERROR_PARSE,
584 _("Failed to parse '%-.*s', which "
585 "should have been a digit "
586 "inside a character reference "
587 "(ê for example) - perhaps "
588 "the digit is too large"),
593 /* characters XML permits */
597 (l >= 0x20 && l <= 0xD7FF) ||
598 (l >= 0xE000 && l <= 0xFFFD) ||
599 (l >= 0x10000 && l <= 0x10FFFF))
602 g_string_append (ucontext->str, char_str (l, buf));
606 set_unescape_error (ucontext->context, error,
607 start, ucontext->text_end,
608 G_MARKUP_ERROR_PARSE,
609 _("Character reference '%-.*s' does not "
610 "encode a permitted character"),
615 /* Move to next state */
616 p = g_utf8_next_char (p); /* past semicolon */
617 ucontext->state = USTATE_INSIDE_TEXT;
621 set_unescape_error (ucontext->context, error,
622 start, ucontext->text_end,
623 G_MARKUP_ERROR_PARSE,
624 _("Empty character reference; "
625 "should include a digit such as "
631 set_unescape_error (ucontext->context, error,
632 start, ucontext->text_end,
633 G_MARKUP_ERROR_PARSE,
634 _("Character reference did not end with a "
636 "most likely you used an ampersand "
637 "character without intending to start "
638 "an entity - escape ampersand as &"));
// Expands entity and character references in the range text .. text_end
// and hands the result back through the unescaped out-parameter.
// The output GString is pre-sized to the input length. The loop dispatches
// on ucontext.state to the four state handlers until the input is consumed
// or the parse context enters STATE_ERROR. Input that ends in the middle of
// a reference is reported as an unfinished entity or character reference.
// On error the output string is freed; on success ownership of the GString
// passes to the caller through *unescaped.
// NOTE(review): the text, unescaped and error parameter lines, the
// initialisation of p, and the return statements are not visible here.
645 unescape_text (GMarkupParseContext *context,
647 const gchar *text_end,
651 UnescapeContext ucontext;
654 ucontext.context = context;
655 ucontext.text = text;
656 ucontext.text_end = text_end;
657 ucontext.entity_start = NULL;
659 ucontext.str = g_string_sized_new (text_end - text);
661 ucontext.state = USTATE_INSIDE_TEXT;
664 while (p != text_end && context->state != STATE_ERROR)
666 g_assert (p < text_end);
668 switch (ucontext.state)
670 case USTATE_INSIDE_TEXT:
672 p = unescape_text_state_inside_text (&ucontext,
678 case USTATE_AFTER_AMPERSAND:
680 p = unescape_text_state_after_ampersand (&ucontext,
687 case USTATE_INSIDE_ENTITY_NAME:
689 p = unescape_text_state_inside_entity_name (&ucontext,
695 case USTATE_AFTER_CHARREF_HASH:
697 p = unescape_text_state_after_charref_hash (&ucontext,
704 g_assert_not_reached ();
// The input is exhausted: any state other than plain text means that a
// reference was cut short.
709 if (context->state != STATE_ERROR)
711 switch (ucontext.state)
713 case USTATE_INSIDE_TEXT:
715 case USTATE_AFTER_AMPERSAND:
716 case USTATE_INSIDE_ENTITY_NAME:
717 set_unescape_error (context, error,
719 G_MARKUP_ERROR_PARSE,
720 _("Unfinished entity reference"));
722 case USTATE_AFTER_CHARREF_HASH:
723 set_unescape_error (context, error,
725 G_MARKUP_ERROR_PARSE,
726 _("Unfinished character reference"));
731 if (context->state == STATE_ERROR)
733 g_string_free (ucontext.str, TRUE);
739 *unescaped = ucontext.str;
// Moves context->iter forward by one UTF-8 character and keeps the
// line and character counters in step: char_number always grows by one,
// and when the new position holds a newline the line number is bumped and
// char_number restarts at 1.
// Callers use the result as a loop condition.
// NOTE(review): the return statements are not visible in this listing;
// presumably FALSE is returned once iter reaches current_text_end.
744 static inline gboolean
745 advance_char (GMarkupParseContext *context)
747 context->iter = g_utf8_next_char (context->iter);
748 context->char_number += 1;
750 if (context->iter == context->current_text_end)
754 else if (*context->iter == '\n')
756 context->line_number += 1;
757 context->char_number = 1;
// Whitespace test limited to the four characters compared below: space,
// tab, newline and carriage return.
// NOTE(review): the line carrying the function name and parameter list is
// not visible in this listing; skip_spaces calls a function named
// xml_isspace, which is presumably this one.
763 static inline gboolean
766 return c == ' ' || c == '\t' || c == '\n' || c == '\r';
// Advances context->iter over a run of whitespace. The test is applied to
// the current character before each advance, and the loop also ends when
// advance_char reports that it cannot continue.
// NOTE(review): the do keyword and the statement executed when the test
// fails are not visible in this listing.
770 skip_spaces (GMarkupParseContext *context)
774 if (!xml_isspace (*context->iter))
777 while (advance_char (context));
// Advances context->iter over a run of name characters, with the same loop
// shape as skip_spaces but using is_name_char as the test.
// NOTE(review): the do keyword and the statement executed when the test
// fails are not visible in this listing.
781 advance_to_name_end (GMarkupParseContext *context)
785 if (!is_name_char (context->iter))
788 while (advance_char (context));
// Appends the byte range text_start .. text_end to context->partial_chunk,
// creating the GString on first use with that range as its initial size.
// An empty range still creates the string, so callers may rely on
// partial_chunk being non-NULL afterwards.
792 add_to_partial (GMarkupParseContext *context,
793 const gchar *text_start,
794 const gchar *text_end)
796 if (context->partial_chunk == NULL)
797 context->partial_chunk = g_string_sized_new (text_end - text_start);
799 if (text_start != text_end)
800 g_string_append_len (context->partial_chunk, text_start,
801 text_end - text_start);
803 /* Invariant here that partial_chunk exists */
// Empties context->partial_chunk, when one exists, by truncating it to
// length zero; the GString object itself is kept for reuse.
807 truncate_partial (GMarkupParseContext *context)
809 if (context->partial_chunk != NULL)
811 context->partial_chunk = g_string_truncate (context->partial_chunk, 0);
// Name of the innermost open element, read from the head of tag_stack.
// There is no NULL check in the visible code, so callers must only use it
// while at least one element is open.
816 current_element (GMarkupParseContext *context)
818 return context->tag_stack->data;
// Name of the attribute currently being collected. Asserts that cur_attr
// is non-negative, that is, that add_attribute has run for this tag.
822 current_attribute (GMarkupParseContext *context)
824 g_assert (context->cur_attr >= 0);
825 return context->attr_names[context->cur_attr];
// Sets context->current_text_end for the chunk in current_text. It locates
// the start of the last character with g_utf8_find_prev_char and computes
// where that character would end from its first byte alone. Two outcomes
// are visible: the last character is whole and the end is the end of the
// chunk; or the trailing bytes are moved into leftover_char_portion and
// the chunk is shortened so that it ends at p.
// Requires current_text_len > 0 (asserted).
// NOTE(review): the comparison that selects between the two outcomes is
// not visible in this listing.
829 find_current_text_end (GMarkupParseContext *context)
831 /* This function must be safe (non-segfaulting) on invalid UTF8.
832 * It assumes the string starts with a character start
834 const gchar *end = context->current_text + context->current_text_len;
838 g_assert (context->current_text_len > 0);
840 p = g_utf8_find_prev_char (context->current_text, end);
842 g_assert (p != NULL); /* since current_text was a char start */
844 /* p is now the start of the last character or character portion. */
846 next = g_utf8_next_char (p); /* this only touches *p, nothing beyond */
850 /* whole character */
851 context->current_text_end = end;
856 context->leftover_char_portion = g_string_new_len (p, end - p);
857 context->current_text_len -= (end - p);
858 context->current_text_end = p;
// Records a new attribute name with a NULL value and keeps both arrays
// NULL-terminated. The arrays grow five slots at a time; the growth test
// uses cur_attr + 2 so that there is always room for the new entry and the
// terminator that follows it.
// Ownership of name moves to the context: the parse loop later frees each
// attr_names entry with g_free.
// NOTE(review): the increment of cur_attr that must precede the stores is
// not visible in this listing.
864 add_attribute (GMarkupParseContext *context, char *name)
866 if (context->cur_attr + 2 >= context->alloc_attrs)
868 context->alloc_attrs += 5; /* silly magic number */
869 context->attr_names = g_realloc (context->attr_names, sizeof(char*)*context->alloc_attrs);
870 context->attr_values = g_realloc (context->attr_values, sizeof(char*)*context->alloc_attrs);
873 context->attr_names[context->cur_attr] = name;
874 context->attr_values[context->cur_attr] = NULL;
875 context->attr_names[context->cur_attr+1] = NULL;
876 context->attr_values[context->cur_attr+1] = NULL;
880 * g_markup_parse_context_parse:
881 * @context: a #GMarkupParseContext
882 * @text: chunk of text to parse
883 * @text_len: length of @text in bytes
884 * @error: return location for a #GError
886 * Feed some data to the #GMarkupParseContext. The data need not
887 * be valid UTF-8; an error will be signaled if it's invalid.
888 * The data need not be an entire document; you can feed a document
889 * into the parser incrementally, via multiple calls to this function.
890 * Typically, as you receive data from a network connection or file,
891 * you feed each received chunk of data into this function, aborting
892 * the process if an error occurs. Once an error is reported, no further
893 * data may be fed to the #GMarkupParseContext; all errors are fatal.
895 * Return value: %FALSE if an error occurred, %TRUE on success
// Feeds one chunk of text into the parser state machine.
// Visible flow:
//   1. reject a NULL context or text, a context already in STATE_ERROR,
//      and a re-entrant call (context->parsing set);
//   2. finish a UTF-8 character that was split across the previous chunk;
//   3. reject a chunk whose first byte is a UTF-8 continuation byte;
//   4. trim a trailing incomplete character (find_current_text_end) and
//      validate the remainder as UTF-8;
//   5. run the state loop until the chunk is consumed.
// The final visible statement yields TRUE unless the context ended up in
// STATE_ERROR.
// NOTE(review): this listing omits many lines of the function (the text,
// text_len and error parameters, braces, several conditions, the goto or
// break statements and the set_error call heads). Comments below describe
// only what the visible lines establish.
898 g_markup_parse_context_parse (GMarkupParseContext *context,
903 const gchar *first_invalid;
905 g_return_val_if_fail (context != NULL, FALSE);
906 g_return_val_if_fail (text != NULL, FALSE);
907 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
908 g_return_val_if_fail (!context->parsing, FALSE);
// NOTE(review): the guard on this assignment is not visible; presumably
// it applies only when the caller passed a negative text_len.
911 text_len = strlen (text);
916 context->parsing = TRUE;
// A previous call ended in the middle of a UTF-8 character. Bytes of the
// form 10xxxxxx are continuation bytes, so the mask test below checks
// whether this chunk starts on a character boundary.
918 if (context->leftover_char_portion)
920 const gchar *first_char;
922 if ((*text & 0xc0) != 0x80)
925 first_char = g_utf8_find_next_char (text, text + text_len);
929 /* leftover_char_portion was completed. Parse it. */
930 GString *portion = context->leftover_char_portion;
932 g_string_append_len (context->leftover_char_portion,
933 text, first_char - text);
// The parsing flag is cleared and the leftover pointer detached so that
// the recursive call passes the re-entrancy check and does not see the
// leftover again.
935 /* hacks to allow recursion */
936 context->parsing = FALSE;
937 context->leftover_char_portion = NULL;
939 if (!g_markup_parse_context_parse (context,
940 portion->str, portion->len,
943 g_assert (context->state == STATE_ERROR);
946 g_string_free (portion, TRUE);
947 context->parsing = TRUE;
949 /* Skip the fraction of char that was in this text */
950 text_len -= (first_char - text);
955 /* another little chunk of the leftover char; geez
956 * someone is inefficient.
958 g_string_append_len (context->leftover_char_portion,
// More than 7 accumulated bytes cannot be a single character, so the
// input is reported as invalid UTF-8.
961 if (context->leftover_char_portion->len > 7)
963 /* The leftover char portion is too big to be
968 G_MARKUP_ERROR_BAD_UTF8,
969 _("Invalid UTF-8 encoded text"));
976 context->current_text = text;
977 context->current_text_len = text_len;
978 context->iter = context->current_text;
979 context->start = context->iter;
981 /* Nothing left after finishing the leftover char, or nothing
982 * passed in to begin with.
984 if (context->current_text_len == 0)
987 /* find_current_text_end () assumes the string starts at
988 * a character start, so we need to validate at least
989 * that much. It doesn't assume any following bytes
992 if ((*context->current_text & 0xc0) == 0x80) /* not a char start */
996 G_MARKUP_ERROR_BAD_UTF8,
997 _("Invalid UTF-8 encoded text"));
1001 /* Initialize context->current_text_end, possibly adjusting
1002 * current_text_len, and add any leftover char portion
1004 find_current_text_end (context);
1006 /* Validate UTF8 (must be done after we find the end, since
1007 * we could have a trailing incomplete char)
1009 if (!g_utf8_validate (context->current_text,
1010 context->current_text_len,
// On invalid input the chunk is walked so that line_number can be
// brought forward by the count held in newlines before the error is
// raised.
1015 p = context->current_text;
1016 while (p != context->current_text_end)
1023 context->line_number += newlines;
1027 G_MARKUP_ERROR_BAD_UTF8,
1028 _("Invalid UTF-8 encoded text"));
// Main state loop: each pass handles the text at context->iter according
// to context->state.
1032 while (context->iter != context->current_text_end)
1034 switch (context->state)
1037 /* Possible next state: AFTER_OPEN_ANGLE */
1039 g_assert (context->tag_stack == NULL);
1041 /* whitespace is ignored outside of any elements */
1042 skip_spaces (context);
1044 if (context->iter != context->current_text_end)
1046 if (*context->iter == '<')
1048 /* Move after the open angle */
1049 advance_char (context);
1051 context->state = STATE_AFTER_OPEN_ANGLE;
1053 /* this could start a passthrough */
1054 context->start = context->iter;
1056 /* document is now non-empty */
1057 context->document_empty = FALSE;
1063 G_MARKUP_ERROR_PARSE,
1064 _("Document must begin with an element (e.g. <book>)"));
1069 case STATE_AFTER_OPEN_ANGLE:
1070 /* Possible next states: INSIDE_OPEN_TAG_NAME,
1071 * AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH
1073 if (*context->iter == '?' ||
1074 *context->iter == '!')
1076 /* include < in the passthrough */
1077 const gchar *openangle = "<";
1078 add_to_partial (context, openangle, openangle + 1);
1079 context->start = context->iter;
1080 context->balance = 1;
1081 context->state = STATE_INSIDE_PASSTHROUGH;
1083 else if (*context->iter == '/')
1086 advance_char (context);
1088 context->state = STATE_AFTER_CLOSE_TAG_SLASH;
1090 else if (is_name_start_char (context->iter))
1092 context->state = STATE_INSIDE_OPEN_TAG_NAME;
1094 /* start of tag name */
1095 context->start = context->iter;
1103 G_MARKUP_ERROR_PARSE,
1104 _("'%s' is not a valid character following "
1105 "a '<' character; it may not begin an "
1107 utf8_str (context->iter, buf));
1111 /* The AFTER_CLOSE_ANGLE state is actually sort of
1112 * broken, because it doesn't correspond to a range
1113 * of characters in the input stream as the others do,
1114 * and thus makes things harder to conceptualize
1116 case STATE_AFTER_CLOSE_ANGLE:
1117 /* Possible next states: INSIDE_TEXT, STATE_START */
1118 if (context->tag_stack == NULL)
1120 context->start = NULL;
1121 context->state = STATE_START;
1125 context->start = context->iter;
1126 context->state = STATE_INSIDE_TEXT;
// Self-closing element: end_element is invoked (when set) before the
// closing angle is checked, and the element name is popped from the tag
// stack at the end of the case.
1130 case STATE_AFTER_ELISION_SLASH:
1131 /* Possible next state: AFTER_CLOSE_ANGLE */
1134 /* We need to pop the tag stack and call the end_element
1135 * function, since this is the close tag
1137 GError *tmp_error = NULL;
1139 g_assert (context->tag_stack != NULL);
1142 if (context->parser->end_element)
1143 (* context->parser->end_element) (context,
1144 context->tag_stack->data,
1150 mark_error (context, tmp_error);
1151 g_propagate_error (error, tmp_error);
1155 if (*context->iter == '>')
1157 /* move after the close angle */
1158 advance_char (context);
1159 context->state = STATE_AFTER_CLOSE_ANGLE;
1167 G_MARKUP_ERROR_PARSE,
1168 _("Odd character '%s', expected a '>' character "
1169 "to end the start tag of element '%s'"),
1170 utf8_str (context->iter, buf),
1171 current_element (context));
1175 g_free (context->tag_stack->data);
1176 context->tag_stack = g_slist_delete_link (context->tag_stack,
1177 context->tag_stack);
1181 case STATE_INSIDE_OPEN_TAG_NAME:
1182 /* Possible next states: BETWEEN_ATTRIBUTES */
1184 /* if there's a partial chunk then it's the first part of the
1185 * tag name. If there's a context->start then it's the start
1186 * of the tag name in current_text, the partial chunk goes
1187 * before that start though.
1189 advance_to_name_end (context);
1191 if (context->iter == context->current_text_end)
1193 /* The name hasn't necessarily ended. Merge with
1194 * partial chunk, leave state unchanged.
1196 add_to_partial (context, context->start, context->iter);
1200 /* The name has ended. Combine it with the partial chunk
1201 * if any; push it on the stack; enter next state.
1203 add_to_partial (context, context->start, context->iter);
1204 context->tag_stack =
1205 g_slist_prepend (context->tag_stack,
1206 g_string_free (context->partial_chunk,
1209 context->partial_chunk = NULL;
1211 context->state = STATE_BETWEEN_ATTRIBUTES;
1212 context->start = NULL;
1216 case STATE_INSIDE_ATTRIBUTE_NAME:
1217 /* Possible next states: AFTER_ATTRIBUTE_NAME */
1219 advance_to_name_end (context);
1220 add_to_partial (context, context->start, context->iter);
1222 /* read the full name, if we enter the equals sign state
1223 * then add the attribute to the list (without the value),
1224 * otherwise store a partial chunk to be prepended later.
1226 if (context->iter != context->current_text_end)
1227 context->state = STATE_AFTER_ATTRIBUTE_NAME;
1230 case STATE_AFTER_ATTRIBUTE_NAME:
1231 /* Possible next states: AFTER_ATTRIBUTE_EQUALS_SIGN */
1233 skip_spaces (context);
1235 if (context->iter != context->current_text_end)
1237 /* The name has ended. Combine it with the partial chunk
1238 * if any; push it on the stack; enter next state.
// The name string is detached from the GString (FALSE keeps the
// character data) and handed to add_attribute.
1240 add_attribute (context, g_string_free (context->partial_chunk, FALSE));
1242 context->partial_chunk = NULL;
1243 context->start = NULL;
1245 if (*context->iter == '=')
1247 advance_char (context);
1248 context->state = STATE_AFTER_ATTRIBUTE_EQUALS_SIGN;
1256 G_MARKUP_ERROR_PARSE,
1257 _("Odd character '%s', expected a '=' after "
1258 "attribute name '%s' of element '%s'"),
1259 utf8_str (context->iter, buf),
1260 current_attribute (context),
1261 current_element (context));
1267 case STATE_BETWEEN_ATTRIBUTES:
1268 /* Possible next states: AFTER_CLOSE_ANGLE,
1269 * AFTER_ELISION_SLASH, INSIDE_ATTRIBUTE_NAME
1271 skip_spaces (context);
1273 if (context->iter != context->current_text_end)
1275 if (*context->iter == '/')
1277 advance_char (context);
1278 context->state = STATE_AFTER_ELISION_SLASH;
1280 else if (*context->iter == '>')
1283 advance_char (context);
1284 context->state = STATE_AFTER_CLOSE_ANGLE;
1286 else if (is_name_start_char (context->iter))
1288 context->state = STATE_INSIDE_ATTRIBUTE_NAME;
1289 /* start of attribute name */
1290 context->start = context->iter;
1298 G_MARKUP_ERROR_PARSE,
1299 _("Odd character '%s', expected a '>' or '/' "
1300 "character to end the start tag of "
1301 "element '%s', or optionally an attribute; "
1302 "perhaps you used an invalid character in "
1303 "an attribute name"),
1304 utf8_str (context->iter, buf),
1305 current_element (context));
1308 /* If we're done with attributes, invoke
1309 * the start_element callback
1311 if (context->state == STATE_AFTER_ELISION_SLASH ||
1312 context->state == STATE_AFTER_CLOSE_ANGLE)
1314 const gchar *start_name;
1315 /* Ugly, but the current code expects an empty array instead of NULL */
// NOTE(review): the initialiser on the next two declarations looks like
// a mis-decoded HTML entity from the extraction; presumably the code
// takes the address of the variable named empty -- confirm upstream.
1316 const gchar *empty = NULL;
1317 const gchar **attr_names = ∅
1318 const gchar **attr_values = ∅
1321 /* Call user callback for element start */
1322 start_name = current_element (context);
1324 if (context->cur_attr >= 0)
1326 attr_names = (const gchar**)context->attr_names;
1327 attr_values = (const gchar**)context->attr_values;
1331 if (context->parser->start_element)
1332 (* context->parser->start_element) (context,
1334 (const gchar **)attr_names,
1335 (const gchar **)attr_values,
// The attribute strings are released from the last entry down to the
// first, leaving cur_attr at -1; the arrays themselves are kept.
1339 /* Go ahead and free the attributes. */
1340 for (; context->cur_attr >= 0; context->cur_attr--)
1342 int pos = context->cur_attr;
1343 g_free (context->attr_names[pos]);
1344 g_free (context->attr_values[pos]);
1345 context->attr_names[pos] = context->attr_values[pos] = NULL;
1347 g_assert (context->cur_attr == -1);
1348 g_assert (context->attr_names == NULL ||
1349 context->attr_names[0] == NULL);
1350 g_assert (context->attr_values == NULL ||
1351 context->attr_values[0] == NULL);
1353 if (tmp_error != NULL)
1355 mark_error (context, tmp_error);
1356 g_propagate_error (error, tmp_error);
1362 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1363 /* Possible next state: INSIDE_ATTRIBUTE_VALUE_[SQ/DQ] */
1365 skip_spaces (context);
1367 if (context->iter != context->current_text_end)
1369 if (*context->iter == '"')
1371 advance_char (context);
1372 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_DQ;
1373 context->start = context->iter;
1375 else if (*context->iter == '\'')
1377 advance_char (context);
1378 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_SQ;
1379 context->start = context->iter;
1387 G_MARKUP_ERROR_PARSE,
1388 _("Odd character '%s', expected an open quote mark "
1389 "after the equals sign when giving value for "
1390 "attribute '%s' of element '%s'"),
1391 utf8_str (context->iter, buf),
1392 current_attribute (context),
1393 current_element (context));
// Attribute value: scan to the matching delimiter, then unescape the
// accumulated text and store it as the value of the current attribute.
1398 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1399 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1400 /* Possible next states: BETWEEN_ATTRIBUTES */
1404 if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ)
1415 if (*context->iter == delim)
1418 while (advance_char (context));
1420 if (context->iter == context->current_text_end)
1422 /* The value hasn't necessarily ended. Merge with
1423 * partial chunk, leave state unchanged.
1425 add_to_partial (context, context->start, context->iter);
1429 /* The value has ended at the quote mark. Combine it
1430 * with the partial chunk if any; set it for the current
1435 add_to_partial (context, context->start, context->iter);
1437 g_assert (context->cur_attr >= 0);
1439 if (unescape_text (context,
1440 context->partial_chunk->str,
1441 context->partial_chunk->str +
1442 context->partial_chunk->len,
1446 /* success, advance past quote and set state. */
1447 context->attr_values[context->cur_attr] = g_string_free (unescaped, FALSE);
1448 advance_char (context);
1449 context->state = STATE_BETWEEN_ATTRIBUTES;
1450 context->start = NULL;
1453 truncate_partial (context);
1457 case STATE_INSIDE_TEXT:
1458 /* Possible next states: AFTER_OPEN_ANGLE */
1461 if (*context->iter == '<')
1464 while (advance_char (context));
1466 /* The text hasn't necessarily ended. Merge with
1467 * partial chunk, leave state unchanged.
1470 add_to_partial (context, context->start, context->iter);
1472 if (context->iter != context->current_text_end)
1474 GString *unescaped = NULL;
1476 /* The text has ended at the open angle. Call the text
1480 if (unescape_text (context,
1481 context->partial_chunk->str,
1482 context->partial_chunk->str +
1483 context->partial_chunk->len,
1487 GError *tmp_error = NULL;
1489 if (context->parser->text)
1490 (*context->parser->text) (context,
1496 g_string_free (unescaped, TRUE);
1498 if (tmp_error == NULL)
1500 /* advance past open angle and set state. */
1501 advance_char (context);
1502 context->state = STATE_AFTER_OPEN_ANGLE;
1503 /* could begin a passthrough */
1504 context->start = context->iter;
1508 mark_error (context, tmp_error);
1509 g_propagate_error (error, tmp_error);
1513 truncate_partial (context);
1517 case STATE_AFTER_CLOSE_TAG_SLASH:
1518 /* Possible next state: INSIDE_CLOSE_TAG_NAME */
1519 if (is_name_start_char (context->iter))
1521 context->state = STATE_INSIDE_CLOSE_TAG_NAME;
1523 /* start of tag name */
1524 context->start = context->iter;
1532 G_MARKUP_ERROR_PARSE,
1533 _("'%s' is not a valid character following "
1534 "the characters '</'; '%s' may not begin an "
1536 utf8_str (context->iter, buf),
1537 utf8_str (context->iter, buf));
1541 case STATE_INSIDE_CLOSE_TAG_NAME:
1542 /* Possible next state: AFTER_CLOSE_TAG_NAME */
1543 advance_to_name_end (context);
1544 add_to_partial (context, context->start, context->iter);
1546 if (context->iter != context->current_text_end)
1547 context->state = STATE_AFTER_CLOSE_TAG_NAME;
// Close tag: the collected name must be followed by a closing angle and
// must equal the innermost open element; only then is end_element
// invoked and the tag stack popped. close_name is freed in every case.
1550 case STATE_AFTER_CLOSE_TAG_NAME:
1551 /* Possible next state: AFTER_CLOSE_TAG_SLASH */
1553 skip_spaces (context);
1555 if (context->iter != context->current_text_end)
1559 /* The name has ended. Combine it with the partial chunk
1560 * if any; check that it matches stack top and pop
1561 * stack; invoke proper callback; enter next state.
1563 close_name = g_string_free (context->partial_chunk, FALSE);
1564 context->partial_chunk = NULL;
1566 if (*context->iter != '>')
1572 G_MARKUP_ERROR_PARSE,
1573 _("'%s' is not a valid character following "
1574 "the close element name '%s'; the allowed "
1575 "character is '>'"),
1576 utf8_str (context->iter, buf),
1579 else if (context->tag_stack == NULL)
1583 G_MARKUP_ERROR_PARSE,
1584 _("Element '%s' was closed, no element "
1585 "is currently open"),
1588 else if (strcmp (close_name, current_element (context)) != 0)
1592 G_MARKUP_ERROR_PARSE,
1593 _("Element '%s' was closed, but the currently "
1594 "open element is '%s'"),
1596 current_element (context));
1601 advance_char (context);
1602 context->state = STATE_AFTER_CLOSE_ANGLE;
1603 context->start = NULL;
1605 /* call the end_element callback */
1607 if (context->parser->end_element)
1608 (* context->parser->end_element) (context,
1614 /* Pop the tag stack */
1615 g_free (context->tag_stack->data);
1616 context->tag_stack = g_slist_delete_link (context->tag_stack,
1617 context->tag_stack);
1621 mark_error (context, tmp_error);
1622 g_propagate_error (error, tmp_error);
1626 g_free (close_name);
// Passthrough (processing instruction, comment, CDATA section or
// DOCTYPE). At each closing angle the text collected so far is checked:
// a processing instruction must end with a question mark, a comment with
// two dashes, a CDATA section with two closing square brackets, and a
// DOCTYPE ends when balance is zero.
// NOTE(review): the updates of context->balance are not visible in this
// listing; presumably it tracks nesting of angle brackets.
1630 case STATE_INSIDE_PASSTHROUGH:
1631 /* Possible next state: AFTER_CLOSE_ANGLE */
1634 if (*context->iter == '<')
1636 if (*context->iter == '>')
1639 add_to_partial (context, context->start, context->iter);
1640 context->start = context->iter;
1641 if ((g_str_has_prefix (context->partial_chunk->str, "<?")
1642 && g_str_has_suffix (context->partial_chunk->str, "?")) ||
1643 (g_str_has_prefix (context->partial_chunk->str, "<!--")
1644 && g_str_has_suffix (context->partial_chunk->str, "--")) ||
1645 (g_str_has_prefix (context->partial_chunk->str, "<![CDATA[")
1646 && g_str_has_suffix (context->partial_chunk->str, "]]")) ||
1647 (g_str_has_prefix (context->partial_chunk->str, "<!DOCTYPE")
1648 && context->balance == 0))
1652 while (advance_char (context));
1654 if (context->iter == context->current_text_end)
1656 /* The passthrough hasn't necessarily ended. Merge with
1657 * partial chunk, leave state unchanged.
1659 add_to_partial (context, context->start, context->iter);
1663 /* The passthrough has ended at the close angle. Combine
1664 * it with the partial chunk if any. Call the passthrough
1665 * callback. Note that the open/close angles are
1666 * included in the text of the passthrough.
1668 GError *tmp_error = NULL;
1670 advance_char (context); /* advance past close angle */
1671 add_to_partial (context, context->start, context->iter);
1673 if (context->parser->passthrough)
1674 (*context->parser->passthrough) (context,
1675 context->partial_chunk->str,
1676 context->partial_chunk->len,
1680 truncate_partial (context);
1682 if (tmp_error == NULL)
1684 context->state = STATE_AFTER_CLOSE_ANGLE;
1685 context->start = context->iter; /* could begin text */
1689 mark_error (context, tmp_error);
1690 g_propagate_error (error, tmp_error);
1700 g_assert_not_reached ();
// Clear the re-entrancy flag; the result reflects whether any step above
// put the context into the error state.
1706 context->parsing = FALSE;
1708 return context->state != STATE_ERROR;
1712 * g_markup_parse_context_end_parse:
1713 * @context: a #GMarkupParseContext
1714 * @error: return location for a #GError
1716 * Signals to the #GMarkupParseContext that all data has been
1717 * fed into the parse context with g_markup_parse_context_parse().
1718 * This function reports an error if the document isn't complete,
1719 * for example if elements are still open.
1721 * Return value: %TRUE on success, %FALSE if an error was set
1724 g_markup_parse_context_end_parse (GMarkupParseContext *context,
1727 g_return_val_if_fail (context != NULL, FALSE);
1728 g_return_val_if_fail (!context->parsing, FALSE);
1729 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
1731 if (context->partial_chunk != NULL)
1733 g_string_free (context->partial_chunk, TRUE);
1734 context->partial_chunk = NULL;
1737 if (context->document_empty)
1739 set_error (context, error, G_MARKUP_ERROR_EMPTY,
1740 _("Document was empty or contained only whitespace"));
1744 context->parsing = TRUE;
1746 switch (context->state)
1752 case STATE_AFTER_OPEN_ANGLE:
1753 set_error (context, error, G_MARKUP_ERROR_PARSE,
1754 _("Document ended unexpectedly just after an open angle bracket '<'"));
1757 case STATE_AFTER_CLOSE_ANGLE:
1758 if (context->tag_stack != NULL)
1760 /* Error message the same as for INSIDE_TEXT */
1761 set_error (context, error, G_MARKUP_ERROR_PARSE,
1762 _("Document ended unexpectedly with elements still open - "
1763 "'%s' was the last element opened"),
1764 current_element (context));
1768 case STATE_AFTER_ELISION_SLASH:
1769 set_error (context, error, G_MARKUP_ERROR_PARSE,
1770 _("Document ended unexpectedly, expected to see a close angle "
1771 "bracket ending the tag <%s/>"), current_element (context));
1774 case STATE_INSIDE_OPEN_TAG_NAME:
1775 set_error (context, error, G_MARKUP_ERROR_PARSE,
1776 _("Document ended unexpectedly inside an element name"));
1779 case STATE_INSIDE_ATTRIBUTE_NAME:
1780 set_error (context, error, G_MARKUP_ERROR_PARSE,
1781 _("Document ended unexpectedly inside an attribute name"));
1784 case STATE_BETWEEN_ATTRIBUTES:
1785 set_error (context, error, G_MARKUP_ERROR_PARSE,
1786 _("Document ended unexpectedly inside an element-opening "
1790 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1791 set_error (context, error, G_MARKUP_ERROR_PARSE,
1792 _("Document ended unexpectedly after the equals sign "
1793 "following an attribute name; no attribute value"));
1796 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1797 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1798 set_error (context, error, G_MARKUP_ERROR_PARSE,
1799 _("Document ended unexpectedly while inside an attribute "
1803 case STATE_INSIDE_TEXT:
1804 g_assert (context->tag_stack != NULL);
1805 set_error (context, error, G_MARKUP_ERROR_PARSE,
1806 _("Document ended unexpectedly with elements still open - "
1807 "'%s' was the last element opened"),
1808 current_element (context));
1811 case STATE_AFTER_CLOSE_TAG_SLASH:
1812 case STATE_INSIDE_CLOSE_TAG_NAME:
1813 set_error (context, error, G_MARKUP_ERROR_PARSE,
1814 _("Document ended unexpectedly inside the close tag for "
1815 "element '%s'"), current_element);
1818 case STATE_INSIDE_PASSTHROUGH:
1819 set_error (context, error, G_MARKUP_ERROR_PARSE,
1820 _("Document ended unexpectedly inside a comment or "
1821 "processing instruction"));
1826 g_assert_not_reached ();
1830 context->parsing = FALSE;
1832 return context->state != STATE_ERROR;
1836 * g_markup_parse_context_get_element:
1837 * @context: a #GMarkupParseContext
1838 * @returns: the name of the currently open element, or %NULL
1840 * Retrieves the name of the currently open element.
1844 G_CONST_RETURN gchar *
1845 g_markup_parse_context_get_element (GMarkupParseContext *context)
1847 g_return_val_if_fail (context != NULL, NULL);
1849 if (context->tag_stack == NULL)
1852 return current_element (context);
1856 * g_markup_parse_context_get_position:
1857 * @context: a #GMarkupParseContext
1858 * @line_number: return location for a line number, or %NULL
1859 * @char_number: return location for a char-on-line number, or %NULL
1861 * Retrieves the current line number and the number of the character on
1862 * that line. Intended for use in error messages; there are no strict
1863 * semantics for what constitutes the "current" line number other than
1864 * "the best number we could come up with for error messages."
1868 g_markup_parse_context_get_position (GMarkupParseContext *context,
1872 g_return_if_fail (context != NULL);
1875 *line_number = context->line_number;
1878 *char_number = context->char_number;
1882 append_escaped_text (GString *str,
1890 end = text + length;
1895 next = g_utf8_next_char (p);
1900 g_string_append (str, "&");
1904 g_string_append (str, "<");
1908 g_string_append (str, ">");
1912 g_string_append (str, "'");
1916 g_string_append (str, """);
1920 g_string_append_len (str, p, next - p);
1929 * g_markup_escape_text:
1930 * @text: some valid UTF-8 text
1931 * @length: length of @text in bytes
1933 * Escapes text so that the markup parser will parse it verbatim.
1934 * Less than, greater than, ampersand, etc. are replaced with the
1935 * corresponding entities. This function would typically be used
1936 * when writing out a file to be parsed with the markup parser.
1938 * Note that this function doesn't protect whitespace and line endings
1939 * from being processed according to the XML rules for normalization
1940 * of line endings and attribute values.
1942 * Return value: escaped text
1945 g_markup_escape_text (const gchar *text,
1950 g_return_val_if_fail (text != NULL, NULL);
1953 length = strlen (text);
1955 /* prealloc at least as long as original text */
1956 str = g_string_sized_new (length);
1957 append_escaped_text (str, text, length);
1959 return g_string_free (str, FALSE);
/* Skips an optional positional-argument reference of the form
 * "<digits>$" at @cp.  Returns the position just past the '$', or
 * @cp unchanged when no such reference starts there.
 */
static const char *
skip_positional_argument (const char *cp)
{
  const char *np = cp;

  while (*np >= '0' && *np <= '9')
    np++;

  if (np != cp && *np == '$')
    return np + 1;

  return cp;
}

/**
 * find_conversion:
 * @format: a printf-style format string
 * @after: location to store a pointer to the character after
 *   the returned conversion. On a %NULL return, returns the
 *   pointer to the trailing NUL in the string
 *
 * Find the next conversion in a printf-style format string.
 * Partially based on code from printf-parser.c,
 * Copyright (C) 1999-2000, 2002-2003 Free Software Foundation, Inc.
 *
 * Return value: pointer to the next conversion in @format,
 *  or %NULL, if none.
 **/
static const char *
find_conversion (const char  *format,
                 const char **after)
{
  const char *start = format;
  const char *cp;

  while (*start != '\0' && *start != '%')
    start++;

  if (*start == '\0')
    {
      *after = start;
      return NULL;
    }

  cp = start + 1;

  /* A lone '%' at the very end is not a conversion. */
  if (*cp == '\0')
    {
      *after = cp;
      return NULL;
    }

  /* Test for positional argument.  */
  cp = skip_positional_argument (cp);

  /* Skip the flags.  */
  while (*cp == '\'' ||
         *cp == '-' ||
         *cp == '+' ||
         *cp == ' ' ||
         *cp == '#' ||
         *cp == '0')
    cp++;

  /* Skip the field width.  */
  if (*cp == '*')
    {
      cp++;

      /* Test for positional argument.  */
      cp = skip_positional_argument (cp);
    }
  else
    {
      while (*cp >= '0' && *cp <= '9')
        cp++;
    }

  /* Skip the precision.  */
  if (*cp == '.')
    {
      cp++;
      if (*cp == '*')
        {
          /* Step over the '*' itself, exactly as the field-width case
           * does; otherwise the '*' is taken for the conversion
           * character and "%.*s" is cut off at "%.*".
           */
          cp++;

          /* Test for positional argument.  */
          cp = skip_positional_argument (cp);
        }
      else
        {
          while (*cp >= '0' && *cp <= '9')
            cp++;
        }
    }

  /* Skip argument type/size specifiers.  */
  while (*cp == 'h' ||
         *cp == 'L' ||
         *cp == 'l' ||
         *cp == 'j' ||
         *cp == 'z' ||
         *cp == 'Z' ||
         *cp == 't')
    cp++;

  /* Skip the conversion character.  A format that ends in the middle
   * of a conversion has none; never step past the terminating NUL.
   */
  if (*cp != '\0')
    cp++;

  *after = cp;
  return start;
}
2089 * g_markup_vprintf_escaped:
2090 * @format: printf() style format string
2091 * @args: variable argument list, similar to vprintf()
2093 * Formats the data in @args according to @format, escaping
2094 * all string and character arguments in the fashion
2095 * of g_markup_escape_text(). See g_markup_printf_escaped().
2097 * Return value: newly allocated result from formatting
2098 * operation. Free with g_free().
2103 g_markup_vprintf_escaped (const char *format,
2108 GString *result = NULL;
2109 gchar *output1 = NULL;
2110 gchar *output2 = NULL;
2111 const char *p, *op1, *op2;
2114 /* The technique here, is that we make two format strings that
2115 * have the identical conversions in the identical order to the
2116 * original strings, but differ in the text in-between. We
2117 * then use the normal g_strdup_vprintf() to format the arguments
2118 * with the two new format strings. By comparing the results,
2119 * we can figure out what segments of the output come from
2120 * the the original format string, and what from the arguments,
2121 * and thus know what portions of the string to escape.
2123 * For instance, for:
2125 * g_markup_printf_escaped ("%s ate %d apples", "Susan & Fred", 5);
2127 * We form the two format strings "%sX%dX" and %sY%sY". The results
2128 * of formatting with those two strings are
2130 * "%sX%dX" => "Susan & FredX5X"
2131 * "%sY%dY" => "Susan & FredY5Y"
2133 * To find the span of the first argument, we find the first position
2134 * where the two arguments differ, which tells us that the first
2135 * argument formatted to "Susan & Fred". We then escape that
2136 * to "Susan & Fred" and join up with the intermediate portions
2137 * of the format string and the second argument to get
2138 * "Susan & Fred ate 5 apples".
2141 /* Create the two modified format strings
2143 format1 = g_string_new (NULL);
2144 format2 = g_string_new (NULL);
2149 const char *conv = find_conversion (p, &after);
2153 g_string_append_len (format1, conv, after - conv);
2154 g_string_append_c (format1, 'X');
2155 g_string_append_len (format2, conv, after - conv);
2156 g_string_append_c (format2, 'Y');
2161 /* Use them to format the arguments
2163 G_VA_COPY (args2, args);
2165 output1 = g_strdup_vprintf (format1->str, args);
2170 output2 = g_strdup_vprintf (format2->str, args2);
2175 result = g_string_new (NULL);
2177 /* Iterate through the original format string again,
2178 * copying the non-conversion portions and the escaped
2179 * converted arguments to the output string.
2187 const char *output_start;
2188 const char *conv = find_conversion (p, &after);
2191 if (!conv) /* The end, after points to the trailing \0 */
2193 g_string_append_len (result, p, after - p);
2197 g_string_append_len (result, p, conv - p);
2199 while (*op1 == *op2)
2205 escaped = g_markup_escape_text (output_start, op1 - output_start);
2206 g_string_append (result, escaped);
2215 g_string_free (format1, TRUE);
2216 g_string_free (format2, TRUE);
2221 return g_string_free (result, FALSE);
2227 * g_markup_printf_escaped:
2228 * @format: printf() style format string
2229 * @Varargs: the arguments to insert in the format string
2231 * Formats arguments according to @format, escaping
2232 * all string and character arguments in the fashion
2233 * of g_markup_escape_text(). This is useful when you
2234 * want to insert literal strings into XML-style markup
2235 * output, without having to worry that the strings
2236 * might themselves contain markup.
2238 * <informalexample><programlisting>
2239 * const char *store = "Fortnum & Mason";
2240 * const char *item = "Tea";
2243 * output = g_markup_printf_escaped ("<purchase>"
2244 * "<store>%s</store>"
2245 * "<item>%s</item>"
2246 * "</purchase>",
2248 * </programlisting></informalexample>
2250 * Return value: newly allocated result from formatting
2251 * operation. Free with g_free().
2256 g_markup_printf_escaped (const char *format, ...)
2261 va_start (args, format);
2262 result = g_markup_vprintf_escaped (format, args);