parse +-separated scheme lists (e.g., svn+ssh);
[platform/upstream/libiri.git] / libiri / parse.c
1 /*
2  * libiri: An IRI/URI/URL parsing library
3  * @(#) $Id$
4  */
5
6 /*
7  * Copyright (c) 2005, 2008 Mo McRoberts.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  * notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in the
16  * documentation and/or other materials provided with the distribution.
17  * 3. The names of the author(s) of this software may not be used to endorse
18  * or promote products derived from this software without specific prior
19  * written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 
22  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 
23  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
24  * AUTHORS OF THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
28  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32
33 #ifdef HAVE_CONFIG_H
34 # include "config.h"
35 #endif
36
37 #include <stdio.h>
38
39 #include "p_libiri.h"
40
/* Alignment, in bytes, applied to every component stored in the parse
 * buffer. Must be a power of two for the mask arithmetic below to work.
 */
#undef ALIGNMENT
#define ALIGNMENT 8
#undef _ALIGN
#undef ALIGN
/* Round the integer x up to the next multiple of ALIGNMENT */
#define _ALIGN(x) ((((x)+(ALIGNMENT-1))&~(ALIGNMENT-1)))
/* Round the pointer x up to the next ALIGNMENT-byte boundary, yielding a
 * char *. The argument is parenthesised so that the cast applies to the
 * whole expression (e.g. ALIGN(ptr + n)) rather than to its first operand.
 */
#define ALIGN(x) ((char *) _ALIGN((size_t) (x)))
46
/* Convert a single hexadecimal digit to its numeric value.
 *
 * c: the character to convert ('0'-'9', 'A'-'F' or 'a'-'f').
 *
 * Returns the value 0-15 for a valid hexadecimal digit, or -1 if c is not
 * one. Callers that shift or OR the result must therefore validate the
 * character first (as iri__copychar_decode does with isxdigit()).
 */
static inline int
iri__hexnibble(char c)
{
	if(c >= '0' && c <= '9')
	{
		return c - '0';
	}
	if(c >= 'A' && c <= 'F')
	{
		return c - 'A' + 10;
	}
	if(c >= 'a' && c <= 'f')
	{
		return c - 'a' + 10;
	}
	/* Not a hex digit: return a defined sentinel instead of falling off
	 * the end of a non-void function.
	 */
	return -1;
}
63
/* Copy one byte verbatim from src to the output cursor.
 *
 * dest: address of the output cursor; the byte is stored at *dest and the
 *       cursor is then advanced by one.
 * src:  the byte to copy.
 *
 * Returns a pointer to the byte following src.
 */
static inline const char *
iri__copychar(char **dest, const char *src)
{
	char *out = *dest;

	*out = *src;
	*dest = out + 1;
	return src + 1;
}
72
/* TODO: Punycode decoding for the host part */
/* Copy one logical character from src to the output cursor, decoding
 * percent-escapes as it goes.
 *
 * dest:          address of the output cursor; exactly one byte is stored
 *                at *dest and the cursor is advanced by one.
 * src:           the input position to read from.
 * convert_space: when exactly 1, a '+' in the input is stored as a space.
 *
 * A '%' followed by two hexadecimal digits is stored as the single byte it
 * encodes and consumes three input bytes; a '%' not followed by two
 * hexadecimal digits is stored literally. Anything else is copied as-is.
 *
 * Returns a pointer to the first input byte not consumed.
 */
static inline const char *
iri__copychar_decode(char **dest, const char *src, int convert_space)
{
	unsigned char *p = (unsigned char *) (*dest);
	
	if(1 == convert_space && '+' == *src)
	{
		**dest = ' ';
	}
	else if('%' == *src)
	{
		/* The <ctype.h> functions require a value representable as
		 * unsigned char; a plain char may be negative for bytes >= 0x80.
		 * The || short-circuits, so src[2] is never read when src[1] is
		 * the terminating NUL.
		 */
		if(0 == isxdigit((unsigned char) src[1]) || 0 == isxdigit((unsigned char) src[2]))
		{
			/* TODO: Deal with %u<nnnn> non-standard encoding - be liberal in
			 * what you accept, etc.
			 */
			**dest = '%';
		}
		else
		{
			*p = (iri__hexnibble(src[1]) << 4) | iri__hexnibble(src[2]);
			/* Skip the two hex digits; the '%' itself is skipped below */
			src += 2;
		}
	}
	else
	{
		**dest = *src;
	}
	src++;
	(*dest)++;
	return src;
}
106
/* Allocate a zero-filled buffer large enough to hold every decoded
 * component of src, plus the scheme list, as laid out by iri_parse().
 *
 * src: the NUL-terminated IRI string which will be parsed.
 * len: receives the size of the allocated buffer, in bytes.
 *
 * Returns the new buffer (to be released with free()), or NULL if src or
 * len is NULL or the allocation fails.
 */
static inline char *
iri__allocbuf(const char *src, size_t *len)
{
	size_t sc;
	const char *p, *c;
	
	if(NULL == src || NULL == len)
	{
		return NULL;
	}
	/* Calculate the size of the buffer required to hold a decoded version of
	 * src, including enough breathing space for null bytes.
	 */
	/* XXX: This is way too much; we need to actually count it */
	*len = (strlen(src) * 4) + 16;
	/* Each component may be preceded by up to 7 bytes of padding so that it
	 * starts on an 8-byte boundary; allow for that regardless of whether
	 * the string contains a scheme.
	 */
	*len += (7 * 11);
	/* Determine how much space we need for the scheme list */
	if(NULL != (c = strchr(src, ':')))
	{
		sc = 1;
		for(p = src; p < c; p++)
		{
			/* A '%' may introduce an encoded '+', which splits the
			 * decoded scheme just as a literal '+' does, so count it
			 * as a potential separator too.
			 */
			if(*p == '+' || *p == '%')
			{
				sc++;
			}
		}
		/* Ensure we can align each element on an 8-byte boundary. This is
		 * added to (not substituted for) the size computed above: the
		 * scheme list is stored alongside the other components.
		 */
		*len += (size_t) (c - src) + 1 + sc + ((sc + 1) * (sizeof(char *) + 7));
	}
	return (char *) calloc(1, *len);
}
135
136 iri_t *
137 iri_parse(const char *src)
138 {
139         iri_t *p;
140         char *bufstart, *endp, *bufp, **sl;
141         const char *at, *colon, *slash, *t;
142         size_t buflen, sc, cp;
143         
144         if(NULL == (p = (iri_t *) calloc(1, sizeof(iri_t))))
145         {
146                 return NULL;
147         }
148         if(NULL == (bufstart = iri__allocbuf(src, &buflen)))
149         {
150                 free(p);
151                 return NULL;
152         }
153         p->base = bufp = bufstart;
154         p->nbytes = buflen;
155         at = strchr(src, '@');
156         slash = strchr(src, '/');
157         colon = strchr(src, ':');
158         if(slash && colon && slash < colon)
159         {
160                 /* We can disregard the colon if a slash appears before it */
161                 colon = NULL;
162         }
163         if(colon && !at)
164         {
165                 /* Definitely a scheme */
166                 bufp = ALIGN(bufp);
167                 p->iri.scheme = bufp;
168                 while(*src && *src != ':')
169                 {
170                         src = iri__copychar_decode(&bufp, src, 0);
171                 }
172                 *bufp = 0;
173                 bufp++;
174                 src++;
175                 /* src[0-1] SHOULD == '/' */
176                 if(src[0] == '/') src++;
177                 if(src[0] == '/') src++;
178         }
179         else if(colon && at && colon < at)
180         {
181                 fprintf(stderr, "Colon occurs before at\n");
182                 /* This could be scheme://user[;auth][:password]@host or [scheme:]user[;auth][:password]@host (urgh) */
183                 if(colon[1] == '/' && colon[2] == '/' && colon[3] != '/')
184                 {
185                         bufp = ALIGN(bufp);
186                         p->iri.scheme = bufp;
187                         while(*src && *src != ':')
188                         {
189                                 src = iri__copychar_decode(&bufp, src, 0);
190                         }
191                         *bufp = 0;
192                         bufp++;
193                         src++;
194                         /* src[0-1] SHOULD == '/' */
195                         for(; *src == '/'; src++);
196                         bufp = ALIGN(bufp);
197                         p->iri.user = bufp;
198                         fprintf(stderr, "Found user\n");
199                 }
200                 else
201                 {
202                         fprintf(stderr, "Matched scheme\n");
203                         bufp = ALIGN(bufp);
204                         p->iri.scheme = bufp;
205                 }
206                 while(*src && *src != ':' && *src != '@' && *src != ';')
207                 {
208                         src = iri__copychar_decode(&bufp, src, 0);
209                 }
210                 *bufp = 0;
211                 bufp++;
212                 if(*src == ';')
213                 {
214                         /* Following authentication parameters */
215                         src++;
216                         bufp = ALIGN(bufp);
217                         p->iri.auth = bufp;
218                         while(*src && *src != ':' && *src != '@')
219                         {
220                                 /* Don't decode, so it can be extracted properly */
221                                 src = iri__copychar(&bufp, src);
222                         }
223                         *bufp = 0;
224                         bufp++;
225                 }
226                 if(*src == ':')
227                 {
228                         /* Following password data */
229                         src++;
230                         bufp = ALIGN(bufp);
231                         p->iri.password = bufp;
232                         while(*src && *src != ':' && *src != '@')
233                         {
234                                 src = iri__copychar_decode(&bufp, src, 0);
235                         }
236                         *bufp = 0;
237                         bufp++;
238                         if(*src == ':')
239                         {
240                                 src++;
241                                 /* It was actually scheme:user:auth@host */
242                                 p->iri.user = p->iri.auth;
243                                 bufp = ALIGN(bufp);
244                                 p->iri.password = bufp;
245                                 while(*src && *src != '@')
246                                 {
247                                         src = iri__copychar_decode(&bufp, src, 0);
248                                 }
249                                 *bufp = 0;
250                                 bufp++;
251                         }
252                 }
253                 if(!*src)
254                 {
255                         /* No host part */
256                         return p;
257                 }
258                 if(*src == '@')
259                 {
260                         src++;
261                 }
262         }
263         else if(at)
264         {
265                 /* user[;auth]@host[/path...] */
266                 bufp = ALIGN(bufp);
267                 p->iri.user = bufp;
268                 while(*src != '@' && *src != ';')
269                 {
270                         src = iri__copychar_decode(&bufp, src, 0);
271                 }
272                 *bufp = 0;
273                 bufp++;
274                 if(*src == ';')
275                 {
276                         src++;
277                         bufp = ALIGN(bufp);
278                         p->iri.auth = bufp;
279                         while(*src && *src != '@')
280                         {
281                                 /* Don't decode, so it can be extracted properly */
282                                 src = iri__copychar(&bufp, src);
283                         }
284                         *bufp = 0;
285                         bufp++;
286                 }
287                 else
288                 {
289                         src++;
290                 }
291         }
292         if(NULL != p->iri.scheme)
293         {
294                 sc = 1;
295                 for(t = p->iri.scheme; *t; t++)
296                 {
297                         if('+' == *t)
298                         {
299                                 sc++;
300                         }
301                 }
302                 bufp = ALIGN(bufp);
303                 sl = (char **) (void *) bufp;
304                 bufp += (sc + 1) * sizeof(char *);
305                 sc = 0;
306                 cp = 0;
307                 bufp = ALIGN(bufp);
308                 sl[0] = bufp;
309                 for(t = p->iri.scheme; *t; t++)
310                 {
311                         if('+' == *t)
312                         {
313                                 if(sl[sc][0])
314                                 {
315                                         sl[sc][cp] = 0;
316                                         bufp++;
317                                         sc++;
318                                         bufp = ALIGN(bufp);
319                                         sl[sc] = bufp;
320                                         cp = 0;
321                                 }
322                         }
323                         else
324                         {
325                                 sl[sc][cp] = *t;
326                                 bufp++;
327                                 cp++;
328                         }
329                 }
330                 if(sl[sc][0])
331                 {
332                         sl[sc][cp] = 0;
333                         sc++;
334                         bufp++;
335                 }
336                 sl[sc] = NULL;
337                 p->iri.schemelist = (const char **) sl;
338                 p->iri.nschemes = sc;
339                 bufp++;
340         }
341         bufp = ALIGN(bufp);
342         p->iri.host = bufp;
343         while(*src && *src != ':' && *src != '/' && *src != '?' && *src != '#')
344         {
345                 src = iri__copychar_decode(&bufp, src, 0);
346         }
347         *bufp = 0;
348         bufp++;
349         if(*src == ':')
350         {
351                 /* Port part */
352                 src++;
353                 endp = (char *) src;
354                 p->iri.port = strtol(src, &endp, 10);
355                 src = endp;
356         }
357         if(*src == '/')
358         {
359                 bufp = ALIGN(bufp);
360                 p->iri.path = bufp;
361                 while(*src && *src != '?' && *src != '#')
362                 {
363                         src = iri__copychar_decode(&bufp, src, 0);
364                 }
365                 *bufp = 0;
366                 bufp++;
367         }
368         if(*src == '?')
369         {
370                 bufp = ALIGN(bufp);
371                 p->iri.query = bufp;
372                 src++;
373                 while(*src && *src != '#')
374                 {
375                         /* Don't actually decode the query itself, otherwise it
376                          * can't be reliably split */
377                         src = iri__copychar(&bufp, src);
378                 }
379                 *bufp = 0;
380                 bufp++;
381         }
382         if(*src == '#')
383         {
384                 bufp = ALIGN(bufp);
385                 p->iri.anchor = bufp; 
386                 while(*src)
387                 {
388                         src = iri__copychar_decode(&bufp, src, 0);
389                 }
390                 *bufp = 0;
391                 bufp++;
392         }
393         if(*src)
394         {
395                 /* Still stuff left? It must be a path... of sorts */
396                 bufp = ALIGN(bufp);
397                 p->iri.path = bufp; 
398                 while(*src && *src != '?' && *src != '#')
399                 {
400                         src = iri__copychar_decode(&bufp, src, 0);
401                 }
402                 *bufp = 0;
403                 bufp++;
404         }
405         return p;
406 }