2 * libiri: An IRI/URI/URL parsing library
7 * Copyright (c) 2005, 2008 Mo McRoberts.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. The names of the author(s) of this software may not be used to endorse
18 * or promote products derived from this software without specific prior
21 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
22 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
23 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
24 * AUTHORS OF THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Round the integer value x up to the next multiple of ALIGNMENT.
 * ALIGNMENT is defined outside this excerpt; the mask arithmetic is only
 * correct when it is a power of two (TODO confirm at its definition).
 */
#define _ALIGN(x) ((((x)+(ALIGNMENT-1))&~(ALIGNMENT-1)))
/* Round the pointer x up to the next ALIGNMENT boundary and yield it as a
 * char *. The argument and the whole expansion are parenthesised so that
 * compound arguments (p + 1, a ? b : c) and postfix use (ALIGN(p)[0])
 * expand as intended.
 */
#define ALIGN(x) ((char *) _ALIGN((size_t) (x)))
/* iri__hexnibble: convert one hexadecimal digit character.
 *
 * Only the three range tests (0-9, A-F, a-f) are visible in this excerpt;
 * the return statements are not. iri__copychar_decode() shifts one result
 * left by four bits and ORs it with a second result to build a byte, so each
 * call presumably yields the 4-bit value of the digit -- confirm against the
 * full source, including what is returned for a non-hex character.
 */
48 iri__hexnibble(char c)
50 if(c >= '0' && c <= '9')
54 if(c >= 'A' && c <= 'F')
58 if(c >= 'a' && c <= 'f')
/* iri__copychar: copy from src to the output cursor *dest.
 *
 * The body is not visible in this excerpt. iri_parse() calls this helper for
 * the authentication parameters and for the query string, at the places
 * where its comments say the text must not be decoded, so this is presumably
 * the non-decoding counterpart of iri__copychar_decode(). Callers assign the
 * return value back to their read pointer, so it presumably points just past
 * the input that was consumed (TODO confirm both).
 */
64 static inline const char *
65 iri__copychar(char **dest, const char *src)
/* iri__copychar_decode: copy from src to the output cursor *dest, decoding
 * escapes on the way.
 *
 *   dest           in/out pointer to the current write position; it is
 *                  accessed through an unsigned char pointer (p) here.
 *   src            current read position in the input.
 *   convert_space  a plus sign in the input is special-cased only when this
 *                  is exactly 1 (the replacement written is not visible in
 *                  this excerpt; presumably a space).
 *
 * Visible behaviour: when src[1] and src[2] are both hexadecimal digits the
 * byte (hi << 4) | lo built from them is stored at the write position. The
 * test that selects the escape introducer, the handling of ordinary
 * characters, the cursor updates and the return statement are not visible.
 * Callers assign the return value back to their read pointer, so it
 * presumably points just past the consumed input (TODO confirm).
 */
73 /* TODO: Punycode decoding for the host part */
74 static inline const char *
75 iri__copychar_decode(char **dest, const char *src, int convert_space)
77 unsigned char *p = (unsigned char *) (*dest);
79 if(1 == convert_space && '+' == *src)
/* NOTE(review): src[1] and src[2] are plain char. The ctype functions need
 * an argument representable as unsigned char (or EOF), so bytes of 0x80 and
 * above can be undefined behaviour where char is signed; a cast to unsigned
 * char at these call sites would make this well defined.
 */
85 if(0 == isxdigit(src[1]) || 0 == isxdigit(src[2]))
87 /* TODO: Deal with %u<nnnn> non-standard encoding - be liberal in
88 * what you accept, etc.
94 *p = (iri__hexnibble(src[1]) << 4) | iri__hexnibble(src[2]);
// iri__allocbuf: work out a buffer size for parsing src, store it in *len,
// and return a zero-filled calloc() block of that many bytes (NULL when
// calloc fails).
//
// Excerpt note: the return type, the declarations of sc, p and c, the braces
// and the statements that maintain sc are not visible here.
108 iri__allocbuf(const char *src, size_t *len)
113 /* Calculate the size of the buffer required to hold a decoded version of
114 * src, including enough breathing space for null bytes.
116 /* XXX: This is way too much; we need to actually count it */
117 *len = (strlen(src) * 4) + 16;
118 /* Determine how much space we need for the scheme list */
119 if(NULL != (c = strchr(src, ':')))
// Walk the text that precedes the first colon. sc presumably ends up as the
// number of schemes found there (the counting statements are not visible).
122 for(p = src; p < c; p++)
129 /* Ensure we can align each element on an 8-byte boundary */
// NOTE(review): this statement assigns to *len instead of adding to it, so
// the estimate derived from strlen(src) above is discarded whenever src
// contains a colon. In addition c points at or after src, so (src - c) is
// zero or negative. Both look unintended (perhaps += and (c - src) were
// meant). iri_parse() copies the decoded components into this buffer, so an
// undersized result would be a buffer overrun -- confirm against the full
// source before relying on this size.
130 *len = (src - c) + 1 + sc + ((sc + 1) * (sizeof(char *) + 7));
133 return (char *) calloc(1, *len);
/* iri_parse: parse the IRI string src into a newly allocated iri_t.
 *
 * Excerpt note: many lines of this function (its return type, the
 * declaration of p, the braces, several field assignments and every return
 * path) are not visible here, so the comments below describe only what the
 * visible statements do.
 *
 * The iri_t comes from calloc() and one scratch buffer comes from
 * iri__allocbuf(). p->base records the start of that buffer and bufp is the
 * write cursor. Where the assignment is visible, a component is handled by
 * pointing the matching p->iri field at bufp and then copying characters
 * from src until one of that component's delimiters is reached.
 *
 * NOTE(review): the fprintf(stderr, ...) calls look like leftover debug
 * tracing in a library routine -- confirm, then remove or guard them.
 */
137 iri_parse(const char *src)
140 char *bufstart, *endp, *bufp, **sl;
141 const char *at, *colon, *slash, *t;
142 size_t buflen, sc, cp;
144 if(NULL == (p = (iri_t *) calloc(1, sizeof(iri_t))))
148 if(NULL == (bufstart = iri__allocbuf(src, &buflen)))
153 p->base = bufp = bufstart;
/* Find the first at-sign, slash and colon; their relative order selects
 * which of the leading forms below is parsed.
 */
155 at = strchr(src, '@');
156 slash = strchr(src, '/');
157 colon = strchr(src, ':');
158 if(slash && colon && slash < colon)
160 /* We can disregard the colon if a slash appears before it */
165 /* Definitely a scheme */
167 p->iri.scheme = bufp;
168 while(*src && *src != ':')
170 src = iri__copychar_decode(&bufp, src, 0);
175 /* src[0-1] SHOULD == '/' */
/* At most two slashes are skipped here, whereas the scheme branch further
 * down skips every consecutive slash.
 */
176 if(src[0] == '/') src++;
177 if(src[0] == '/') src++;
179 else if(colon && at && colon < at)
181 fprintf(stderr, "Colon occurs before at\n");
182 /* This could be scheme://user[;auth][:password]@host or [scheme:]user[;auth][:password]@host (urgh) */
/* A colon followed by exactly two slashes is taken as the end of a scheme. */
183 if(colon[1] == '/' && colon[2] == '/' && colon[3] != '/')
186 p->iri.scheme = bufp;
187 while(*src && *src != ':')
189 src = iri__copychar_decode(&bufp, src, 0);
194 /* src[0-1] SHOULD == '/' */
195 for(; *src == '/'; src++);
198 fprintf(stderr, "Found user\n");
202 fprintf(stderr, "Matched scheme\n");
/* The text up to the next colon, at-sign or semicolon is recorded as the
 * scheme for now; a later visible statement reassigns the user field once
 * the form scheme:user:auth@host is recognised.
 */
204 p->iri.scheme = bufp;
206 while(*src && *src != ':' && *src != '@' && *src != ';')
208 src = iri__copychar_decode(&bufp, src, 0);
214 /* Following authentication parameters */
218 while(*src && *src != ':' && *src != '@')
220 /* Don't decode, so it can be extracted properly */
221 src = iri__copychar(&bufp, src);
228 /* Following password data */
231 p->iri.password = bufp;
232 while(*src && *src != ':' && *src != '@')
234 src = iri__copychar_decode(&bufp, src, 0);
241 /* It was actually scheme:user:auth@host */
242 p->iri.user = p->iri.auth;
244 p->iri.password = bufp;
245 while(*src && *src != '@')
247 src = iri__copychar_decode(&bufp, src, 0);
265 /* user[;auth]@host[/path...] */
/* NOTE(review): unlike the other copy loops this one does not test for the
 * terminating NUL, so it relies on an at-sign or semicolon lying ahead of
 * src. The guarding condition is not visible in this excerpt -- confirm.
 */
268 while(*src != '@' && *src != ';')
270 src = iri__copychar_decode(&bufp, src, 0);
279 while(*src && *src != '@')
281 /* Don't decode, so it can be extracted properly */
282 src = iri__copychar(&bufp, src);
/* Scheme list: when a scheme was captured, two passes are made over it. The
 * work done inside the passes is not visible; sc is later stored as the
 * number of schemes.
 */
292 if(NULL != p->iri.scheme)
295 for(t = p->iri.scheme; *t; t++)
/* The pointer array lives in the same scratch buffer: sl aliases the current
 * cursor and the cursor is advanced past sc + 1 pointer slots.
 * NOTE(review): the cast assumes bufp is suitably aligned for char *; any
 * aligning step is not visible in this excerpt -- confirm.
 */
303 sl = (char **) (void *) bufp;
304 bufp += (sc + 1) * sizeof(char *);
309 for(t = p->iri.scheme; *t; t++)
337 p->iri.schemelist = (const char **) sl;
338 p->iri.nschemes = sc;
/* Copy up to the next colon, slash, question mark or hash (presumably the
 * host; the field assignment is not visible).
 */
343 while(*src && *src != ':' && *src != '/' && *src != '?' && *src != '#')
345 src = iri__copychar_decode(&bufp, src, 0);
/* The port is read as a base-10 number and endp receives the position after
 * the digits. NOTE(review): no range or errno check is visible here.
 */
354 p->iri.port = strtol(src, &endp, 10);
/* Copy up to the next question mark or hash (presumably the path). */
361 while(*src && *src != '?' && *src != '#')
363 src = iri__copychar_decode(&bufp, src, 0);
373 while(*src && *src != '#')
375 /* Don't actually decode the query itself, otherwise it
376 * can't be reliably split */
377 src = iri__copychar(&bufp, src);
/* The anchor field points at the decoded copy that follows. */
385 p->iri.anchor = bufp;
388 src = iri__copychar_decode(&bufp, src, 0);
395 /* Still stuff left? It must be a path... of sorts */
398 while(*src && *src != '?' && *src != '#')
400 src = iri__copychar_decode(&bufp, src, 0);