src/tool_urlglob.c

   1 /***************************************************************************
   2  *                                  _   _ ____  _
   3  *  Project                     ___| | | |  _ \| |
   4  *                             / __| | | | |_) | |
   5  *                            | (__| |_| |  _ <| |___
   6  *                             \___|\___/|_| \_\_____|
   7  *
   8  * Copyright (C) 1998 - 2016, Daniel Stenberg, <daniel@haxx.se>, et al.
   9  *
  10  * This software is licensed as described in the file COPYING, which
  11  * you should have received as part of this distribution. The terms
  12  * are also available at https://curl.haxx.se/docs/copyright.html.
  13  *
  14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
  15  * copies of the Software, and permit persons to whom the Software is
  16  * furnished to do so, under the terms of the COPYING file.
  17  *
  18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  19  * KIND, either express or implied.
  20  *
  21  ***************************************************************************/
  22 #include "tool_setup.h"
  23
  24 #define ENABLE_CURLX_PRINTF
  25 /* use our own printf() functions */
  26 #include "curlx.h"
  27 #include "tool_cfgable.h"
  28 #include "tool_doswin.h"
  29 #include "tool_urlglob.h"
  30 #include "tool_vms.h"
  31
  32 #include "memdebug.h" /* keep this as LAST include */
  33
  34 #define GLOBERROR(string, column, code) \
  35   glob->error = string, glob->pos = column, code
  36
  37 void glob_cleanup(URLGlob* glob);
  38
  39 static CURLcode glob_fixed(URLGlob *glob, char *fixed, size_t len)
  40 {
  41   URLPattern *pat = &glob->pattern[glob->size];
  42   pat->type = UPTSet;
  43   pat->content.Set.size = 1;
  44   pat->content.Set.ptr_s = 0;
  45   pat->globindex = -1;
  46
  47   pat->content.Set.elements = malloc(sizeof(char*));
  48
  49   if(!pat->content.Set.elements)
  50     return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY);
  51
  52   pat->content.Set.elements[0] = malloc(len+1);
  53   if(!pat->content.Set.elements[0])
  54     return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY);
  55
  56   memcpy(pat->content.Set.elements[0], fixed, len);
  57   pat->content.Set.elements[0][len] = 0;
  58
  59   return CURLE_OK;
  60 }
  61
  62 /* multiply
  63  *
  64  * Multiplies and checks for overflow.
  65  */
  66 static int multiply(unsigned long *amount, long with)
  67 {
  68   unsigned long sum = *amount * with;
  69   if(sum/with != *amount)
  70     return 1; /* didn't fit, bail out */
  71   *amount = sum;
  72   return 0;
  73 }
  74
  75 static CURLcode glob_set(URLGlob *glob, char **patternp,
  76                          size_t *posp, unsigned long *amount,
  77                          int globindex)
  78 {
  79   /* processes a set expression with the point behind the opening '{'
  80      ','-separated elements are collected until the next closing '}'
  81   */
  82   URLPattern *pat;
  83   bool done = FALSE;
  84   char *buf = glob->glob_buffer;
  85   char *pattern = *patternp;
  86   char *opattern = pattern;
  87   size_t opos = *posp-1;
  88
  89   pat = &glob->pattern[glob->size];
  90   /* patterns 0,1,2,... correspond to size=1,3,5,... */
  91   pat->type = UPTSet;
  92   pat->content.Set.size = 0;
  93   pat->content.Set.ptr_s = 0;
  94   pat->content.Set.elements = NULL;
  95   pat->globindex = globindex;
  96
  97   while(!done) {
  98     switch (*pattern) {
  99     case '\0':                  /* URL ended while set was still open */
 100       return GLOBERROR("unmatched brace", opos, CURLE_URL_MALFORMAT);
 101
 102     case '{':
 103     case '[':                   /* no nested expressions at this time */
 104       return GLOBERROR("nested brace", *posp, CURLE_URL_MALFORMAT);
 105
 106     case '}':                           /* set element completed */
 107       if(opattern == pattern)
 108         return GLOBERROR("empty string within braces", *posp,
 109                          CURLE_URL_MALFORMAT);
 110
 111       /* add 1 to size since it'll be incremented below */
 112       if(multiply(amount, pat->content.Set.size+1))
 113         return GLOBERROR("range overflow", 0, CURLE_URL_MALFORMAT);
 114
 115       /* fall-through */
 116     case ',':
 117
 118       *buf = '\0';
 119       if(pat->content.Set.elements) {
 120         char **new_arr = realloc(pat->content.Set.elements,
 121                                  (pat->content.Set.size + 1) * sizeof(char*));
 122         if(!new_arr)
 123           return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY);
 124
 125         pat->content.Set.elements = new_arr;
 126       }
 127       else
 128         pat->content.Set.elements = malloc(sizeof(char*));
 129
 130       if(!pat->content.Set.elements)
 131         return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY);
 132
 133       pat->content.Set.elements[pat->content.Set.size] =
 134         strdup(glob->glob_buffer);
 135       if(!pat->content.Set.elements[pat->content.Set.size])
 136         return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY);
 137       ++pat->content.Set.size;
 138
 139       if(*pattern == '}') {
 140         pattern++; /* pass the closing brace */
 141         done = TRUE;
 142         continue;
 143       }
 144
 145       buf = glob->glob_buffer;
 146       ++pattern;
 147       ++(*posp);
 148       break;
 149
 150     case ']':                           /* illegal closing bracket */
 151       return GLOBERROR("unexpected close bracket", *posp, CURLE_URL_MALFORMAT);
 152
 153     case '\\':                          /* escaped character, skip '\' */
 154       if(pattern[1]) {
 155         ++pattern;
 156         ++(*posp);
 157       }
 158       /* intentional fallthrough */
 159     default:
 160       *buf++ = *pattern++;              /* copy character to set element */
 161       ++(*posp);
 162     }
 163   }
 164
 165   *patternp = pattern; /* return with the new position */
 166   return CURLE_OK;
 167 }
 168
 169 static CURLcode glob_range(URLGlob *glob, char **patternp,
 170                            size_t *posp, unsigned long *amount,
 171                            int globindex)
 172 {
 173   /* processes a range expression with the point behind the opening '['
 174      - char range: e.g. "a-z]", "B-Q]"
 175      - num range: e.g. "0-9]", "17-2000]"
 176      - num range with leading zeros: e.g. "001-999]"
 177      expression is checked for well-formedness and collected until the next ']'
 178   */
 179   URLPattern *pat;
 180   int rc;
 181   char *pattern = *patternp;
 182   char *c;
 183
 184   pat = &glob->pattern[glob->size];
 185   pat->globindex = globindex;
 186
 187   if(ISALPHA(*pattern)) {
 188     /* character range detected */
 189     char min_c;
 190     char max_c;
 191     int step=1;
 192
 193     pat->type = UPTCharRange;
 194
 195     rc = sscanf(pattern, "%c-%c", &min_c, &max_c);
 196
 197     if((rc == 2) && (pattern[3] == ':')) {
 198       char *endp;
 199       unsigned long lstep;
 200       errno = 0;
 201       lstep = strtoul(&pattern[4], &endp, 10);
 202       if(errno || (*endp != ']'))
 203         step = -1;
 204       else {
 205         pattern = endp+1;
 206         step = (int)lstep;
 207         if(step > (max_c - min_c))
 208           step = -1;
 209       }
 210     }
 211     else
 212       pattern += 4;
 213
 214     *posp += (pattern - *patternp);
 215
 216     if((rc != 2) || (min_c >= max_c) || ((max_c - min_c) > ('z' - 'a')) ||
 217        (step <= 0) )
 218       /* the pattern is not well-formed */
 219       return GLOBERROR("bad range", *posp, CURLE_URL_MALFORMAT);
 220
 221     /* if there was a ":[num]" thing, use that as step or else use 1 */
 222     pat->content.CharRange.step = step;
 223     pat->content.CharRange.ptr_c = pat->content.CharRange.min_c = min_c;
 224     pat->content.CharRange.max_c = max_c;
 225
 226     if(multiply(amount, (pat->content.CharRange.max_c -
 227                           pat->content.CharRange.min_c) /
 228                          pat->content.CharRange.step + 1) )
 229       return GLOBERROR("range overflow", *posp, CURLE_URL_MALFORMAT);
 230   }
 231   else if(ISDIGIT(*pattern)) {
 232     /* numeric range detected */
 233     unsigned long min_n;
 234     unsigned long max_n = 0;
 235     unsigned long step_n = 0;
 236     char *endp;
 237
 238     pat->type = UPTNumRange;
 239     pat->content.NumRange.padlength = 0;
 240
 241     if(*pattern == '0') {
 242       /* leading zero specified, count them! */
 243       c = pattern;
 244       while(ISDIGIT(*c)) {
 245         c++;
 246         ++pat->content.NumRange.padlength; /* padding length is set for all
 247                                               instances of this pattern */
 248       }
 249     }
 250
 251     errno = 0;
 252     min_n = strtoul(pattern, &endp, 10);
 253     if(errno || (endp == pattern))
 254       endp=NULL;
 255     else {
 256       if(*endp != '-')
 257         endp = NULL;
 258       else {
 259         pattern = endp+1;
 260         errno = 0;
 261         max_n = strtoul(pattern, &endp, 10);
 262         if(errno || (*endp == ':')) {
 263           pattern = endp+1;
 264           errno = 0;
 265           step_n = strtoul(pattern, &endp, 10);
 266           if(errno)
 267             /* over/underflow situation */
 268             endp = NULL;
 269         }
 270         else
 271           step_n = 1;
 272         if(endp && (*endp == ']')) {
 273           pattern= endp+1;
 274         }
 275         else
 276           endp = NULL;
 277       }
 278     }
 279
 280     *posp += (pattern - *patternp);
 281
 282     if(!endp || (min_n > max_n) || (step_n > (max_n - min_n)) || !step_n)
 283       /* the pattern is not well-formed */
 284       return GLOBERROR("bad range", *posp, CURLE_URL_MALFORMAT);
 285
 286     /* typecasting to ints are fine here since we make sure above that we
 287        are within 31 bits */
 288     pat->content.NumRange.ptr_n = pat->content.NumRange.min_n = min_n;
 289     pat->content.NumRange.max_n = max_n;
 290     pat->content.NumRange.step = step_n;
 291
 292     if(multiply(amount, (pat->content.NumRange.max_n -
 293                          pat->content.NumRange.min_n) /
 294                         pat->content.NumRange.step + 1) )
 295       return GLOBERROR("range overflow", *posp, CURLE_URL_MALFORMAT);
 296   }
 297   else
 298     return GLOBERROR("bad range specification", *posp, CURLE_URL_MALFORMAT);
 299
 300   *patternp = pattern;
 301   return CURLE_OK;
 302 }
 303
 304 static bool peek_ipv6(const char *str, size_t *skip)
 305 {
 306   /*
 307    * Scan for a potential IPv6 literal.
 308    * - Valid globs contain a hyphen and <= 1 colon.
 309    * - IPv6 literals contain no hyphens and >= 2 colons.
 310    */
 311   size_t i = 0;
 312   size_t colons = 0;
 313   if(str[i++] != '[') {
 314     return FALSE;
 315   }
 316   for(;;) {
 317     const char c = str[i++];
 318     if(ISALNUM(c) || c == '.' || c == '%') {
 319       /* ok */
 320     }
 321     else if(c == ':') {
 322       colons++;
 323     }
 324     else if(c == ']') {
 325       *skip = i;
 326       return colons >= 2 ? TRUE : FALSE;
 327     }
 328     else {
 329       return FALSE;
 330     }
 331   }
 332 }
 333
 334 static CURLcode glob_parse(URLGlob *glob, char *pattern,
 335                            size_t pos, unsigned long *amount)
 336 {
 337   /* processes a literal string component of a URL
 338      special characters '{' and '[' branch to set/range processing functions
 339    */
 340   CURLcode res = CURLE_OK;
 341   int globindex = 0; /* count "actual" globs */
 342
 343   *amount = 1;
 344
 345   while(*pattern && !res) {
 346     char *buf = glob->glob_buffer;
 347     size_t sublen = 0;
 348     while(*pattern && *pattern != '{') {
 349       if(*pattern == '[') {
 350         /* Skip over potential IPv6 literals. */
 351         size_t skip;
 352         if(peek_ipv6(pattern, &skip)) {
 353           memcpy(buf, pattern, skip);
 354           buf += skip;
 355           pattern += skip;
 356           sublen += skip;
 357           continue;
 358         }
 359         break;
 360       }
 361       if(*pattern == '}' || *pattern == ']')
 362         return GLOBERROR("unmatched close brace/bracket", pos,
 363                          CURLE_URL_MALFORMAT);
 364
 365       /* only allow \ to escape known "special letters" */
 366       if(*pattern == '\\' &&
 367          (*(pattern+1) == '{' || *(pattern+1) == '[' ||
 368           *(pattern+1) == '}' || *(pattern+1) == ']') ) {
 369
 370         /* escape character, skip '\' */
 371         ++pattern;
 372         ++pos;
 373       }
 374       *buf++ = *pattern++; /* copy character to literal */
 375       ++pos;
 376       sublen++;
 377     }
 378     if(sublen) {
 379       /* we got a literal string, add it as a single-item list */
 380       *buf = '\0';
 381       res = glob_fixed(glob, glob->glob_buffer, sublen);
 382     }
 383     else {
 384       switch (*pattern) {
 385       case '\0': /* done  */
 386         break;
 387
 388       case '{':
 389         /* process set pattern */
 390         pattern++;
 391         pos++;
 392         res = glob_set(glob, &pattern, &pos, amount, globindex++);
 393         break;
 394
 395       case '[':
 396         /* process range pattern */
 397         pattern++;
 398         pos++;
 399         res = glob_range(glob, &pattern, &pos, amount, globindex++);
 400         break;
 401       }
 402     }
 403
 404     if(++glob->size >= GLOB_PATTERN_NUM)
 405       return GLOBERROR("too many globs", pos, CURLE_URL_MALFORMAT);
 406   }
 407   return res;
 408 }
 409
 410 CURLcode glob_url(URLGlob** glob, char* url, unsigned long *urlnum,
 411                   FILE *error)
 412 {
 413   /*
 414    * We can deal with any-size, just make a buffer with the same length
 415    * as the specified URL!
 416    */
 417   URLGlob *glob_expand;
 418   unsigned long amount = 0;
 419   char *glob_buffer;
 420   CURLcode res;
 421
 422   *glob = NULL;
 423
 424   glob_buffer = malloc(strlen(url) + 1);
 425   if(!glob_buffer)
 426     return CURLE_OUT_OF_MEMORY;
 427
 428   glob_expand = calloc(1, sizeof(URLGlob));
 429   if(!glob_expand) {
 430     Curl_safefree(glob_buffer);
 431     return CURLE_OUT_OF_MEMORY;
 432   }
 433   glob_expand->urllen = strlen(url);
 434   glob_expand->glob_buffer = glob_buffer;
 435
 436   res = glob_parse(glob_expand, url, 1, &amount);
 437   if(!res)
 438     *urlnum = amount;
 439   else {
 440     if(error && glob_expand->error) {
 441       char text[128];
 442       const char *t;
 443       if(glob_expand->pos) {
 444         snprintf(text, sizeof(text), "%s in column %zu", glob_expand->error,
 445                  glob_expand->pos);
 446         t = text;
 447       }
 448       else
 449         t = glob_expand->error;
 450
 451       /* send error description to the error-stream */
 452       fprintf(error, "curl: (%d) [globbing] %s\n", res, t);
 453     }
 454     /* it failed, we cleanup */
 455     glob_cleanup(glob_expand);
 456     *urlnum = 1;
 457     return res;
 458   }
 459
 460   *glob = glob_expand;
 461   return CURLE_OK;
 462 }
 463
 464 void glob_cleanup(URLGlob* glob)
 465 {
 466   size_t i;
 467   int elem;
 468
 469   for(i = 0; i < glob->size; i++) {
 470     if((glob->pattern[i].type == UPTSet) &&
 471        (glob->pattern[i].content.Set.elements)) {
 472       for(elem = glob->pattern[i].content.Set.size - 1;
 473           elem >= 0;
 474           --elem) {
 475         Curl_safefree(glob->pattern[i].content.Set.elements[elem]);
 476       }
 477       Curl_safefree(glob->pattern[i].content.Set.elements);
 478     }
 479   }
 480   Curl_safefree(glob->glob_buffer);
 481   Curl_safefree(glob);
 482 }
 483
 484 CURLcode glob_next_url(char **globbed, URLGlob *glob)
 485 {
 486   URLPattern *pat;
 487   size_t i;
 488   size_t len;
 489   size_t buflen = glob->urllen + 1;
 490   char *buf = glob->glob_buffer;
 491
 492   *globbed = NULL;
 493
 494   if(!glob->beenhere)
 495     glob->beenhere = 1;
 496   else {
 497     bool carry = TRUE;
 498
 499     /* implement a counter over the index ranges of all patterns, starting
 500        with the rightmost pattern */
 501     for(i = 0; carry && (i < glob->size); i++) {
 502       carry = FALSE;
 503       pat = &glob->pattern[glob->size - 1 - i];
 504       switch (pat->type) {
 505       case UPTSet:
 506         if((pat->content.Set.elements) &&
 507            (++pat->content.Set.ptr_s == pat->content.Set.size)) {
 508           pat->content.Set.ptr_s = 0;
 509           carry = TRUE;
 510         }
 511         break;
 512       case UPTCharRange:
 513         pat->content.CharRange.ptr_c =
 514           (char)(pat->content.CharRange.step +
 515                  (int)((unsigned char)pat->content.CharRange.ptr_c));
 516         if(pat->content.CharRange.ptr_c > pat->content.CharRange.max_c) {
 517           pat->content.CharRange.ptr_c = pat->content.CharRange.min_c;
 518           carry = TRUE;
 519         }
 520         break;
 521       case UPTNumRange:
 522         pat->content.NumRange.ptr_n += pat->content.NumRange.step;
 523         if(pat->content.NumRange.ptr_n > pat->content.NumRange.max_n) {
 524           pat->content.NumRange.ptr_n = pat->content.NumRange.min_n;
 525           carry = TRUE;
 526         }
 527         break;
 528       default:
 529         printf("internal error: invalid pattern type (%d)\n", (int)pat->type);
 530         return CURLE_FAILED_INIT;
 531       }
 532     }
 533     if(carry) {         /* first pattern ptr has run into overflow, done! */
 534       /* TODO: verify if this should actally return CURLE_OK. */
 535       return CURLE_OK; /* CURLE_OK to match previous behavior */
 536     }
 537   }
 538
 539   for(i = 0; i < glob->size; ++i) {
 540     pat = &glob->pattern[i];
 541     switch(pat->type) {
 542     case UPTSet:
 543       if(pat->content.Set.elements) {
 544         len = strlen(pat->content.Set.elements[pat->content.Set.ptr_s]);
 545         snprintf(buf, buflen, "%s",
 546                  pat->content.Set.elements[pat->content.Set.ptr_s]);
 547         buf += len;
 548         buflen -= len;
 549       }
 550       break;
 551     case UPTCharRange:
 552       *buf++ = pat->content.CharRange.ptr_c;
 553       break;
 554     case UPTNumRange:
 555       len = snprintf(buf, buflen, "%0*ld",
 556                      pat->content.NumRange.padlength,
 557                      pat->content.NumRange.ptr_n);
 558       buf += len;
 559       buflen -= len;
 560       break;
 561     default:
 562       printf("internal error: invalid pattern type (%d)\n", (int)pat->type);
 563       return CURLE_FAILED_INIT;
 564     }
 565   }
 566   *buf = '\0';
 567
 568   *globbed = strdup(glob->glob_buffer);
 569   if(!*globbed)
 570     return CURLE_OUT_OF_MEMORY;
 571
 572   return CURLE_OK;
 573 }
 574
 575 CURLcode glob_match_url(char **result, char *filename, URLGlob *glob)
 576 {
 577   char *target;
 578   size_t allocsize;
 579   char numbuf[18];
 580   char *appendthis = NULL;
 581   size_t appendlen = 0;
 582   size_t stringlen = 0;
 583
 584   *result = NULL;
 585
 586   /* We cannot use the glob_buffer for storage here since the filename may
 587    * be longer than the URL we use. We allocate a good start size, then
 588    * we need to realloc in case of need.
 589    */
 590   allocsize = strlen(filename) + 1; /* make it at least one byte to store the
 591                                        trailing zero */
 592   target = malloc(allocsize);
 593   if(!target)
 594     return CURLE_OUT_OF_MEMORY;
 595
 596   while(*filename) {
 597     if(*filename == '#' && ISDIGIT(filename[1])) {
 598       unsigned long i;
 599       char *ptr = filename;
 600       unsigned long num = strtoul(&filename[1], &filename, 10);
 601       URLPattern *pat =NULL;
 602
 603       if(num < glob->size) {
 604         num--; /* make it zero based */
 605         /* find the correct glob entry */
 606         for(i=0; i<glob->size; i++) {
 607           if(glob->pattern[i].globindex == (int)num) {
 608             pat = &glob->pattern[i];
 609             break;
 610           }
 611         }
 612       }
 613
 614       if(pat) {
 615         switch (pat->type) {
 616         case UPTSet:
 617           if(pat->content.Set.elements) {
 618             appendthis = pat->content.Set.elements[pat->content.Set.ptr_s];
 619             appendlen =
 620               strlen(pat->content.Set.elements[pat->content.Set.ptr_s]);
 621           }
 622           break;
 623         case UPTCharRange:
 624           numbuf[0] = pat->content.CharRange.ptr_c;
 625           numbuf[1] = 0;
 626           appendthis = numbuf;
 627           appendlen = 1;
 628           break;
 629         case UPTNumRange:
 630           snprintf(numbuf, sizeof(numbuf), "%0*d",
 631                    pat->content.NumRange.padlength,
 632                    pat->content.NumRange.ptr_n);
 633           appendthis = numbuf;
 634           appendlen = strlen(numbuf);
 635           break;
 636         default:
 637           fprintf(stderr, "internal error: invalid pattern type (%d)\n",
 638                   (int)pat->type);
 639           Curl_safefree(target);
 640           return CURLE_FAILED_INIT;
 641         }
 642       }
 643       else {
 644         /* #[num] out of range, use the #[num] in the output */
 645         filename = ptr;
 646         appendthis = filename++;
 647         appendlen = 1;
 648       }
 649     }
 650     else {
 651       appendthis = filename++;
 652       appendlen = 1;
 653     }
 654     if(appendlen + stringlen >= allocsize) {
 655       char *newstr;
 656       /* we append a single byte to allow for the trailing byte to be appended
 657          at the end of this function outside the while() loop */
 658       allocsize = (appendlen + stringlen) * 2;
 659       newstr = realloc(target, allocsize + 1);
 660       if(!newstr) {
 661         Curl_safefree(target);
 662         return CURLE_OUT_OF_MEMORY;
 663       }
 664       target = newstr;
 665     }
 666     memcpy(&target[stringlen], appendthis, appendlen);
 667     stringlen += appendlen;
 668   }
 669   target[stringlen]= '\0';
 670
 671 #if defined(MSDOS) || defined(WIN32)
 672   {
 673     char *sanitized;
 674     SANITIZEcode sc = sanitize_file_name(&sanitized, target,
 675                                          (SANITIZE_ALLOW_PATH |
 676                                           SANITIZE_ALLOW_RESERVED));
 677     Curl_safefree(target);
 678     if(sc)
 679       return CURLE_URL_MALFORMAT;
 680     target = sanitized;
 681   }
 682 #endif /* MSDOS || WIN32 */
 683
 684   *result = target;
 685   return CURLE_OK;
 686 }