2 * libiri: An IRI/URI/URL parsing library
7 * Copyright (c) 2005, 2008 Mo McRoberts.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. The names of the author(s) of this software may not be used to endorse
18 * or promote products derived from this software without specific prior
21 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
22 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
23 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
24 * AUTHORS OF THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * _ALIGN(x): round the integer value x up to the next multiple of ALIGNMENT.
 * The mask arithmetic is only correct when ALIGNMENT is a power of two.
 * NOTE(review): ALIGNMENT is defined outside these lines - confirm its value
 * and type there.
 */
#define _ALIGN(x) ((((x)+(ALIGNMENT-1))&~(ALIGNMENT-1)))
/*
 * ALIGN(x): apply the same rounding to a pointer-like value and yield the
 * result as a char *. Both the argument and the whole expansion are
 * parenthesized so that uses such as ALIGN(c ? a : b) or ALIGN(p)[0] group
 * the way the caller wrote them.
 */
#define ALIGN(x) ((char *) _ALIGN((size_t) (x)))
48 iri__hexnibble(char c)
50 if(c >= '0' && c <= '9')
54 if(c >= 'A' && c <= 'F')
58 if(c >= 'a' && c <= 'f')
65 static inline const char *
66 iri__copychar(char **dest, const char *src)
74 /* TODO: Punycode decoding for the host part */
75 static inline const char *
76 iri__copychar_decode(char **dest, const char *src, int convert_space)
78 unsigned char *p = (unsigned char *) (*dest);
80 if(1 == convert_space && '+' == *src)
86 if(0 == isxdigit(src[1]) || 0 == isxdigit(src[2]))
88 /* TODO: Deal with %u<nnnn> non-standard encoding - be liberal in
89 * what you accept, etc.
95 *p = (iri__hexnibble(src[1]) << 4) | iri__hexnibble(src[2]);
109 iri__allocbuf(const char *src, size_t *len)
114 Internal format of IRI structure is very hard to understand at first.
115 The buffer is used to store character strings with every parsed part of
116 IRI, like host, user, auth, path etc. Start of every character string is
117 ALIGNED to ALIGNMENT value and finished with NULL byte.
118 Above that, the buffer is used to keep a variable-size array of parsed
119 scheme parts. It consists of an array of addresses pointing to the starts
120 of the scheme parts, which are kept like all other character strings, so are
121 aligned to ALIGNMENT and ended with a NULL byte.
122 This function calculates approximation of buffer size to store all the
125 Fully filled buffer with scheme parts looks as follows:
126 0. start of the buffer
127 1. aligned start of the scheme part with added NULL byte
128 2. aligned start of the user part with added NULL byte
129 3. aligned start of the password part with added NULL byte
130 4. aligned start of the array of size schemes_number+1 of pointers that point
131 to consecutive scheme part character strings (last one is NULL)
132 schemes_number is a number of scheme tokens delimited with + sign in
134 5. schemes_number character strings of scheme parts, each of which is
135 aligned and finished with NULL byte.
136 6. aligned start of the host part with added NULL byte
137 7. aligned start of the path part with added NULL byte
138 8. aligned start of the query part with added NULL byte
139 9. aligned start of the anchor part with added NULL byte
141 Four kinds of characters can be identified in an IRI:
142 - characters which are copied one to one (e.g. letters)
143 - characters which are removed (special characters like comma in scheme)
144 - characters which are replaced with other characters where buffer grows
145 this only happens with scheme part
146 - characters which are replaced with other characters where buffer decreases
148 Aligning a pointer will, in the worst case, advance the buffer pointer
151 Knowing all that we can count an approximation of buffer size which can
152 be trusted that whole parsed IRI content will fit in.
155 /* first approximation - all characters will have to be stored in buffer */
158 /* second approximation - IRI has all possible parts which have to be
159 * aligned to ALIGNMENT and have NULL byte at the end. There are 7 different
161 *len += 7 * (ALIGNMENT-1 + 1);
163 /* third approximation - we have to make room for the scheme parts array.
164 * The array consists of an aligned array of n + 1 pointers and n
165 * character strings, each aligned and NULL byte terminated.
167 if(NULL != (c = strchr(src, ':')))
170 for(p = src; p < c; p++)
177 /* fourth approximation - all characters of scheme part will be stored
178 * in scheme parts tokens */
181 /* fifth approximation - Ensure we can align each element on an
182 * ALIGNMENT byte boundary and append NULL byte */
183 *len += sc * (ALIGNMENT-1 + 1);
185 /* sixth approximation - Ensure we have room for aligned array
187 *len += ALIGNMENT-1 + (sc + 1) * (sizeof(char*)/sizeof(char));
189 return (char *) calloc(1, *len);
193 iri_parse(const char *src)
196 char *bufstart, *endp, *bufp, **sl;
197 const char *at, *colon, *slash, *t, *slash3rd;
198 size_t buflen, sc, cp;
200 if(NULL == (p = (iri_t *) calloc(1, sizeof(iri_t))))
204 if(NULL == (bufstart = iri__allocbuf(src, &buflen)))
209 p->base = bufp = bufstart;
211 at = strchr(src, '@');
212 slash = strchr(src, '/');
213 colon = strchr(src, ':');
214 if(slash && colon && slash < colon)
216 /* We can disregard the colon if a slash appears before it */
219 // "@" is valid character in hierarchical part of IRI
220 if(slash && colon && (colon[1] != '/' || colon[2] != '/'))
222 //if scheme is not suffixed with ://, there is no authority
223 //therefore authority (and user within) is not set
226 else if(at && slash && slash[1] && slash[2])
228 slash3rd = strchr(slash + 2, '/');
229 //here we know scheme suffix is "://" so authority can exist
230 //3rd slash should match start of hierarchical part if exists
231 //@ after that is valid character
232 if(slash3rd && slash3rd < at)
239 /* Definitely a scheme */
241 p->iri.scheme = bufp;
242 while(*src && *src != ':')
244 src = iri__copychar_decode(&bufp, src, 0);
249 /* src[0-1] SHOULD == '/' */
250 if(src[0] == '/') src++;
251 if(src[0] == '/') src++;
253 else if(colon && at && colon < at)
255 fprintf(stderr, "Colon occurs before at\n");
256 /* This could be scheme://user[;auth][:password]@host or [scheme:]user[;auth][:password]@host (urgh) */
257 if(colon[1] == '/' && colon[2] == '/' && colon[3] != '/')
260 p->iri.scheme = bufp;
261 while(*src && *src != ':')
263 src = iri__copychar_decode(&bufp, src, 0);
268 /* src[0-1] SHOULD == '/' */
269 for(; *src == '/'; src++);
272 fprintf(stderr, "Found user\n");
276 fprintf(stderr, "Matched scheme\n");
278 p->iri.scheme = bufp;
280 while(*src && *src != ':' && *src != '@' && *src != ';')
282 src = iri__copychar_decode(&bufp, src, 0);
288 /* Following authentication parameters */
292 while(*src && *src != ':' && *src != '@')
294 /* Don't decode, so it can be extracted properly */
295 src = iri__copychar(&bufp, src);
302 /* Following password data */
305 p->iri.password = bufp;
306 while(*src && *src != ':' && *src != '@')
308 src = iri__copychar_decode(&bufp, src, 0);
315 /* It was actually scheme:user:auth@host */
316 p->iri.user = p->iri.auth;
318 p->iri.password = bufp;
319 while(*src && *src != '@')
321 src = iri__copychar_decode(&bufp, src, 0);
339 /* user[;auth]@host[/path...] */
342 while(*src != '@' && *src != ';')
344 src = iri__copychar_decode(&bufp, src, 0);
353 while(*src && *src != '@')
355 /* Don't decode, so it can be extracted properly */
356 src = iri__copychar(&bufp, src);
366 if(NULL != p->iri.scheme)
369 for(t = p->iri.scheme; *t; t++)
377 sl = (char **) (void *) bufp;
378 bufp += (sc + 1) * sizeof(char *);
383 for(t = p->iri.scheme; *t; t++)
411 p->iri.schemelist = (const char **) sl;
412 p->iri.nschemes = sc;
417 while(*src && *src != ':' && *src != '/' && *src != '?' && *src != '#')
419 src = iri__copychar_decode(&bufp, src, 0);
428 p->iri.port = strtol(src, &endp, 10);
435 while(*src && *src != '?' && *src != '#')
437 src = iri__copychar_decode(&bufp, src, 0);
447 while(*src && *src != '#')
449 /* Don't actually decode the query itself, otherwise it
450 * can't be reliably split */
451 src = iri__copychar(&bufp, src);
459 p->iri.anchor = bufp;
462 src = iri__copychar_decode(&bufp, src, 0);
469 /* Still stuff left? It must be a path... of sorts */
472 while(*src && *src != '?' && *src != '#')
474 src = iri__copychar_decode(&bufp, src, 0);