1 /* GIO - GLib Input, Output and Streaming Library
3 * Copyright (C) 2009 Paolo Borelli
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General
16 * Public License along with this library; if not, write to the
17 * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
18 * Boston, MA 02111-1307, USA.
20 * Author: Paolo Borelli <pborelli@gnome.org>
24 #include "gutf8inputstream.h"
25 #include "ginputstream.h"
26 #include "gcancellable.h"
33 * SECTION:gutf8inputstream
34 * @short_description: Input Stream performing UTF8 validation
36 * @see_also: #GFilterInputStream, #GInputStream
38 * utf8 input stream implements #GFilterInputStream and provides
39 * UTF8 validation of the data read from the stream.
40 * If the supplied buffer is long enough (see below), the returned
41 * data is guaranteed to end at utf8 character boundaries.
44 * Extra care must be taken when performing "small" reads:
45 * unless you have control of the data being read, you need
46 * to always supply a buffer at least 6 bytes long, otherwise
47 * the returned content may be an incomplete utf8 byte sequence.
51 * To create a utf8 input stream, use g_utf8_input_stream_new().
56 #define MAX_UNICHAR_LEN 6
/* Private per-instance state of GUtf8InputStream; the rest of this file
 * reaches it through stream->priv. Both byte buffers are sized to hold
 * at most MAX_UNICHAR_LEN bytes. */
58 struct _GUtf8InputStreamPrivate {
59 /* buffer containing trailing partial character not yet returned */
60 char buffer[MAX_UNICHAR_LEN];
63 /* buffer containing partial character returned in a "small read"
64 * but not yet validated */
65 char small_read_buffer[MAX_UNICHAR_LEN];
69 static gssize g_utf8_input_stream_read (GInputStream *stream,
72 GCancellable *cancellable,
75 G_DEFINE_TYPE (GUtf8InputStream,
77 G_TYPE_FILTER_INPUT_STREAM)
/* Class initializer for GUtf8InputStream. */
81 g_utf8_input_stream_class_init (GUtf8InputStreamClass *klass)
83 GInputStreamClass *istream_class;
/* reserve room for a GUtf8InputStreamPrivate in every instance */
85 g_type_class_add_private (klass, sizeof (GUtf8InputStreamPrivate));
/* install the validating read implementation as the class read_fn */
87 istream_class = G_INPUT_STREAM_CLASS (klass);
88 istream_class->read_fn = g_utf8_input_stream_read;
/* Instance initializer: looks up the instance's private data and keeps
 * the pointer in stream->priv for the other functions in this file. */
92 g_utf8_input_stream_init (GUtf8InputStream *stream)
94 stream->priv = G_TYPE_INSTANCE_GET_PRIVATE (stream,
95 G_TYPE_UTF8_INPUT_STREAM,
96 GUtf8InputStreamPrivate);
100 * g_utf8_input_stream_new:
101 * @base_stream: a #GInputStream.
103 * Creates a new #GUtf8InputStream from the given @base_stream.
105 * Returns: a #GInputStream for the given @base_stream.
110 g_utf8_input_stream_new (GInputStream *base_stream)
112 GInputStream *stream;
/* an argument that is not a GInputStream makes the call return NULL */
114 g_return_val_if_fail (G_IS_INPUT_STREAM (base_stream), NULL);
/* construct the object, passing @base_stream as its "base-stream" property */
116 stream = g_object_new (G_TYPE_UTF8_INPUT_STREAM,
117 "base-stream", base_stream,
/* Copies the first len bytes of @remainder into priv->buffer so they can
 * be handed back by a later call to get_remainder(). */
124 store_remainder (GUtf8InputStream *stream,
125 const char *remainder,
128 GUtf8InputStreamPrivate *priv;
133 /* we store a remainder only after having
134 * consumed the previous one */
135 g_assert (priv->len == 0);
/* byte-by-byte copy into the pending buffer */
137 for (i = 0; i < len; ++i)
138 priv->buffer[i] = remainder[i];
/* Copies pending bytes from priv->buffer into the caller's buffer (no
 * more than count of them) and then moves bytes that follow them to the
 * front of priv->buffer. */
143 get_remainder (GUtf8InputStream *stream,
147 GUtf8InputStreamPrivate *priv;
/* the pending buffer must hold fewer than MAX_UNICHAR_LEN bytes */
153 g_assert (priv->len < MAX_UNICHAR_LEN);
/* hand out no more than the caller asked for */
155 len = MIN (count, priv->len);
156 for (i = 0; i < len; ++i)
157 buffer[i] = priv->buffer[i];
/* NOTE(review): res is assigned on a line not visible here; presumably it
 * is the number of bytes just copied -- confirm against the full file */
160 /* if there is more remainder, move it to the start */
161 for (i = 0; i < (priv->len - res); ++i)
162 priv->buffer[i] = priv->buffer[res + i];
/* Appends len bytes of buffer to the end of priv->small_read_buffer and
 * advances priv->small_read_len by the number of bytes copied. */
169 store_small_read (GUtf8InputStream *stream,
173 GUtf8InputStreamPrivate *priv;
178 /* if we reach MAX_UNICHAR_LEN it is either valid
179 * or invalid, so we should already have removed it
181 g_assert (priv->small_read_len + len < MAX_UNICHAR_LEN);
183 for (i = 0; i < len; ++i)
184 priv->small_read_buffer[priv->small_read_len + i] = buffer[i];
185 priv->small_read_len += i;
188 /* Combines the current "small read" buffer with the new
189 * bytes given, validates the buffer and if needed
193 * the number of bytes of buffer that are needed to
194 * make the current small read buffer valid.
196 * -1 if the small read buffer is invalid
198 * 0 if it is an incomplete character or if the
199 * small read buffer is empty.
202 validate_small_read (GUtf8InputStream *stream,
206 GUtf8InputStreamPrivate *priv;
/* no bytes are pending from an earlier small read */
214 if (priv->small_read_len == 0)
/* append new bytes after the stored ones, stopping at whichever comes
 * first: len bytes or a full MAX_UNICHAR_LEN buffer */
217 for (i = 0; i < MIN (len, MAX_UNICHAR_LEN - priv->small_read_len); ++i)
218 priv->small_read_buffer[priv->small_read_len + i] = buffer[i];
/* decode one character from the stored bytes plus the i bytes just added */
220 c = g_utf8_get_char_validated (priv->small_read_buffer, priv->small_read_len + i);
/* (gunichar)-1: the bytes cannot form a valid character */
221 if (c == (gunichar)-1)
223 priv->small_read_len = 0;
/* (gunichar)-2: the bytes are still an incomplete character */
226 if (c == (gunichar)-2)
/* p points just past the first character; subtracting the end of the
 * previously stored bytes gives how many of the new bytes completed it */
231 p = g_utf8_next_char (priv->small_read_buffer);
232 res = p - (priv->small_read_buffer + priv->small_read_len);
236 /* reset the buffer */
237 priv->small_read_len = 0;
/* The read_fn implementation installed by g_utf8_input_stream_class_init().
 * It places bytes left over from an earlier call at the start of the
 * buffer, reads from the base stream after them, and runs UTF-8
 * validation on the result. */
243 g_utf8_input_stream_read (GInputStream *stream,
246 GCancellable *cancellable,
249 GUtf8InputStream *ustream;
250 GUtf8InputStreamPrivate *priv;
251 GInputStream *base_stream;
252 gsize nvalid, remainder;
253 gssize oldread, nread, offset;
257 ustream = G_UTF8_INPUT_STREAM (stream);
258 priv = ustream->priv;
260 /* if we had previous incomplete data put it at the start of the buffer */
261 oldread = get_remainder (ustream, buffer, count);
263 /* if we have already reached count, it is "small read":
264 * store it to validate later */
265 if (oldread == count)
267 store_small_read (ustream, buffer, oldread);
/* the new data is read from the wrapped stream and lands right after the
 * oldread bytes already placed in the buffer */
271 base_stream = g_filter_input_stream_get_base_stream (G_FILTER_INPUT_STREAM (stream));
273 nread = g_input_stream_read (base_stream,
274 (char *)buffer + oldread,
282 /* take into account bytes we put in the buffer */
286 /* validate previous small reads */
287 offset = validate_small_read (ustream, buffer, nread);
/* validation starts offset bytes into the buffer; end receives the
 * position at which g_utf8_validate() stopped */
292 valid = g_utf8_validate ((char *)buffer + offset, nread - offset, &end);
293 nvalid = end - (char *)buffer;
/* number of bytes between the validated part and the end of the data */
298 remainder = nread - nvalid;
300 /* if validation failed in the last bytes and the byte
301 * sequence is an incomplete character and EOF is not reached,
302 * try to read further to see if we stopped in the middle
304 if ((remainder < MAX_UNICHAR_LEN) &&
306 (g_utf8_get_char_validated ((char *)buffer + nvalid, remainder) == (gunichar)-2))
310 /* A "small" read: store it to validate later */
311 store_small_read (ustream, buffer, nread);
/* stash the unvalidated tail bytes in priv->buffer */
315 store_remainder (ustream, (char *)buffer + nvalid, remainder);
/* report the failure through @error as G_IO_ERROR_INVALID_DATA */
321 g_set_error (error, G_IO_ERROR, G_IO_ERROR_INVALID_DATA,
322 _("Invalid UTF-8 sequence in input"));
326 #define __G_UTF8_INPUT_STREAM_C__
327 #include "gioaliasdef.c"