1 /* Support for Robot Exclusion Standard (RES).
2 Copyright (C) 2001, 2006, 2007, 2008, 2009 Free Software Foundation,
5 This file is part of Wget.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or (at
10 your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
20 Additional permission under GNU GPL version 3 section 7
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
31 /* This file implements the Robot Exclusion Standard (RES).
33 RES is a simple protocol that enables site admins to signalize to
34 the web crawlers that certain parts of the site should not be
35 accessed. All the admin needs to do is create a "robots.txt" file
36 in the web server root, and use simple commands to allow or
37 disallow access to certain parts of the site.
39 The first specification was written by Martijn Koster in 1994, and
40 is still available at <http://www.robotstxt.org/wc/norobots.html>.
41 In 1996, Martijn wrote an Internet Draft specifying an improved RES
42 specification; however, that work was apparently abandoned since
43 the draft has expired in 1997 and hasn't been replaced since. The
45 <http://www.robotstxt.org/wc/norobots-rfc.html>.
47 This file implements RES as specified by the draft. Note that this
48 only handles the "robots.txt" support. The META tag that controls
49 whether the links should be followed is handled in `html-url.c'.
53 * The end-of-line comment recognition is more in the spirit of the
54 Bourne Shell (as specified by RES-1994). That means that
55 "foo#bar" is taken literally, whereas "foo #bar" is interpreted
56 as "foo". The Draft apparently specifies that both should be
59 * We don't recognize sole CR as the line ending.
61 * We don't implement expiry mechanism for /robots.txt specs. I
62 consider it non-necessary for a relatively short-lived
63 application such as Wget. Besides, it is highly questionable
64 whether anyone deploys the recommended expiry scheme for
67 Entry points are functions res_parse, res_parse_from_file,
68 res_match_path, res_register_specs, res_get_specs, and
/* NOTE(review): the two fields below belong to struct definitions whose
   opening and closing lines are missing from this excerpt (apparently
   `struct path_info` and `struct robot_specs`) -- confirm against the
   full file before relying on this layout. */
92 bool user_agent_exact_p;
/* Dynamic array of parsed allow/disallow path rules. */
98 struct path_info *paths;
101 /* Parsing the robot spec. */
103 /* Check whether AGENT (a string of length LENGTH) equals "wget" or
104 "*". If it is either of them, *matches is set to one. If it is
105 "wget", *exact_match is set to one. */
/* Decide whether AGENT (LENGTH bytes, not NUL-terminated) applies to us:
   either the wildcard "*" or the literal "wget" (case-insensitive).
   NOTE(review): this excerpt is missing lines of the function (its
   storage class/return type, braces, and the assignments that set
   *matches) -- only the two comparison branches are visible. */
108 match_user_agent (const char *agent, int length,
109 bool *matches, bool *exact_match)
/* A sole "*" applies to every crawler, but is not an exact match. */
111 if (length == 1 && *agent == '*')
114 *exact_match = false;
/* Case-insensitive comparison of AGENT against the literal "wget". */
116 else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
124 *exact_match = false;
128 /* Add a path specification between PATH_B and PATH_E as one of the
/* Appends one allow/disallow rule to SPECS->paths, growing the array
   as needed.  NOTE(review): several statements are elided in this
   excerpt -- the declaration of the local `pp`, the increment of
   specs->count, and the size-growth arithmetic. */
132 add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
133 bool allowedp, bool exactp)
/* Stored paths carry no leading slash; strip it from the input. */
136 if (path_b < path_e && *path_b == '/')
137 /* Our path representation doesn't use a leading slash, so remove
/* Heap-copy the [PATH_B, PATH_E) range into the rule. */
140 pp.path = strdupdelim (path_b, path_e);
141 pp.allowedp = allowedp;
142 pp.user_agent_exact_p = exactp;
/* Grow the array when count exceeds the allocated size.
   NOTE(review): the growth-policy lines are not visible here. */
144 if (specs->count > specs->size)
146 if (specs->size == 0)
150 specs->paths = xrealloc (specs->paths,
151 specs->size * sizeof (struct path_info));
/* Store the new rule in the last (just-claimed) slot. */
153 specs->paths[specs->count - 1] = pp;
156 /* Recreate SPECS->paths with only those paths that have
157 user_agent_exact_p set to true. */
160 prune_non_exact (struct robot_specs *specs)
162 struct path_info *newpaths;
/* NOTE(review): the declarations of i, j, cnt and the statement that
   accumulates `cnt` are missing from this excerpt. */
/* First pass: count the exact-match entries to keep. */
165 for (i = 0; i < specs->count; i++)
166 if (specs->paths[i].user_agent_exact_p)
/* Allocate the pruned array and copy only exact-match entries. */
168 newpaths = xnew_array (struct path_info, cnt);
169 for (i = 0, j = 0; i < specs->count; i++)
170 if (specs->paths[i].user_agent_exact_p)
171 newpaths[j++] = specs->paths[i];
/* Release the old array and install the pruned replacement. */
173 xfree (specs->paths);
174 specs->paths = newpaths;
/* True when P has reached the (comment-trimmed) end of the line;
   relies on `lineend` being in scope at the expansion site. */
179 #define EOL(p) ((p) >= lineend)
/* Advance P past whitespace, stopping at end of line.
   NOTE(review): the macro's loop body and `} while (0)` tail are not
   visible in this excerpt. */
181 #define SKIP_SPACE(p) do { \
182 while (!EOL (p) && c_isspace (*p)) \
/* Case-insensitive match of the current field name (between
   field_b/field_e, set by the parser) against a string literal. */
186 #define FIELD_IS(string_literal) \
187 BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
189 /* Parse textual RES specs beginning with SOURCE of length LENGTH.
190 Return a specs objects ready to be fed to res_match_path.
192 The parsing itself is trivial, but creating a correct SPECS object
193 is trickier than it seems, because RES is surprisingly byzantine if
194 you attempt to implement it correctly.
196 A "record" is a block of one or more `User-Agent' lines followed by
197 one or more `Allow' or `Disallow' lines. Record is accepted by
198 Wget if one of the `User-Agent' lines was "wget", or if the user
201 After all the lines have been read, we examine whether an exact
202 ("wget") user-agent field was specified. If so, we delete all the
203 lines read under "User-Agent: *" blocks because we have our own
204 Wget-specific blocks. This enables the admin to say:
213 This means that to Wget and to Google, /cgi-bin is disallowed,
214 whereas for all other crawlers, everything is disallowed.
215 res_parse is implemented so that the order of records doesn't
216 matter. In the case above, the "User-Agent: *" could have come
217 after the other one. */
/* Parse LENGTH bytes of robots.txt text at SOURCE into a freshly
   allocated robot_specs object (semantics described in the comment
   above).  NOTE(review): many lines of this function are missing from
   this excerpt -- the main line loop header, several braces, the
   line_count/found_exact updates, value_b/value_e extraction, and the
   return statement among them.  Comments below annotate only the
   statements that are visible. */
220 res_parse (const char *source, int length)
224 const char *p = source;
225 const char *end = source + length;
227 /* true if last applicable user-agent field matches Wget. */
228 bool user_agent_applies = false;
230 /* true if last applicable user-agent field *exactly* matches
232 bool user_agent_exact = false;
234 /* whether we ever encountered exact user agent. */
235 bool found_exact = false;
237 /* count of allow/disallow lines in the current "record", i.e. after
238 the last `user-agent' instructions. */
239 int record_count = 0;
/* Result object, zero-initialized by xnew0. */
241 struct robot_specs *specs = xnew0 (struct robot_specs);
245 const char *lineend, *lineend_real;
246 const char *field_b, *field_e;
247 const char *value_b, *value_e;
/* Locate the physical end of the current line. */
251 lineend_real = memchr (p, '\n', end - p);
256 lineend = lineend_real;
258 /* Before doing anything else, check whether the line is empty
/* Skip blank lines and whole-line comments. */
261 if (EOL (p) || *p == '#')
264 /* Make sure the end-of-line comments are respected by setting
265 lineend to a location preceding the first comment. Real line
266 ending remains in lineend_real. */
267 for (lineend = p; lineend < lineend_real; lineend++)
268 if ((lineend == p || c_isspace (*(lineend - 1)))
272 /* Ignore trailing whitespace in the same way. */
273 while (lineend > p && c_isspace (*(lineend - 1)))
/* Scan the field name: alphanumerics and '-' up to the ':'. */
279 while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
/* A line with no field name or no ':' separator is malformed. */
284 if (field_b == field_e || EOL (p) || *p != ':')
286 DEBUGP (("Ignoring malformed line %d", line_count));
297 /* Finally, we have a syntactically valid line. */
298 if (FIELD_IS ("user-agent"))
300 /* We have to support several cases:
307 ... matching record ...
311 ... non-matching record ...
314 ... matching record, but will be pruned later ...
316 We have to respect `User-Agent' at the beginning of each
317 new record simply because we don't know if we're going to
318 encounter "Wget" among the agents or not. Hence,
319 match_user_agent is called when record_count != 0.
321 But if record_count is 0, we have to keep calling it
322 until it matches, and if that happens, we must not call
323 it any more, until the next record. Hence the other part
325 if (record_count != 0 || user_agent_applies == false)
326 match_user_agent (value_b, value_e - value_b,
327 &user_agent_applies, &user_agent_exact);
328 if (user_agent_exact)
332 else if (FIELD_IS ("allow"))
334 if (user_agent_applies)
/* Record an "allow" rule for the current agent block. */
336 add_path (specs, value_b, value_e, true, user_agent_exact);
340 else if (FIELD_IS ("disallow"))
342 if (user_agent_applies)
344 bool allowed = false;
345 if (value_b == value_e)
346 /* Empty "disallow" line means everything is *allowed*! */
348 add_path (specs, value_b, value_e, allowed, user_agent_exact);
/* Unknown field names are ignored (logged only in debug mode). */
354 DEBUGP (("Ignoring unknown field at line %d", line_count));
365 /* We've encountered an exactly matching user-agent. Throw out
366 all the stuff with user-agent: *. */
367 prune_non_exact (specs);
369 else if (specs->size > specs->count)
371 /* add_path normally over-allocates specs->paths. Reallocate it
372 to the correct size in order to conserve some memory. */
373 specs->paths = xrealloc (specs->paths,
374 specs->count * sizeof (struct path_info));
375 specs->size = specs->count;
381 /* The same like res_parse, but first map the FILENAME into memory,
382 and then parse it. */
385 res_parse_from_file (const char *filename)
387 struct robot_specs *specs;
/* read_file maps/reads the whole file into memory (fm->content,
   fm->length). */
388 struct file_memory *fm = read_file (filename);
/* NOTE(review): the `if (!fm)` guard around this error message, the
   early return, and the fm cleanup are elided from this excerpt. */
391 logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
392 filename, strerror (errno));
/* Parse the in-memory file contents into a specs object. */
395 specs = res_parse (fm->content, fm->length);
/* Release memory owned by SPECS: each stored path string and the paths
   array.  NOTE(review): the declaration of `i` and the final free of
   SPECS itself are elided from this excerpt -- confirm in full file. */
401 free_specs (struct robot_specs *specs)
404 for (i = 0; i < specs->count; i++)
405 xfree (specs->paths[i].path);
/* paths may be NULL when no rules were ever added. */
406 xfree_null (specs->paths);
410 /* Matching of a path according to the specs. */
412 /* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
413 that number is not a numerical representation of '/', decode C and
414 advance the pointer. */
/* NOTE(review): lines of this macro are elided in this excerpt (the
   assignment of `decoded` back into C and the pointer advance).  Per
   the comment above, '/' is deliberately left %-encoded so an encoded
   slash never matches a literal path separator. */
416 #define DECODE_MAYBE(c, ptr) do { \
417 if (c == '%' && c_isxdigit (ptr[1]) && c_isxdigit (ptr[2])) \
419 char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \
420 if (decoded != '/') \
428 /* The inner matching engine: return true if RECORD_PATH matches
429 URL_PATH. The rules for matching are described at
430 <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2. */
/* Return true if RECORD_PATH matches URL_PATH under the rules of the
   RES draft (section 3.2.2, cited above).  NOTE(review): the actual
   comparison loop and return statements are elided from this excerpt;
   only the cursor setup and the decode steps are visible. */
433 matches (const char *record_path, const char *url_path)
435 const char *rp = record_path;
436 const char *up = url_path;
/* Decode a %XX escape in either string unless it encodes '/'. */
446 DECODE_MAYBE(rc, rp);
447 DECODE_MAYBE(uc, up);
453 /* Iterate through all paths in SPECS. For the first one that
454 matches, return its allow/reject status. If none matches,
455 retrieval is by default allowed. */
/* Return the allow/disallow verdict of the first rule in SPECS that
   matches PATH (first match wins; per the comment above, no match
   means allowed).  NOTE(review): the declaration of `i`, the braces,
   and both return statements are elided from this excerpt. */
458 res_match_path (const struct robot_specs *specs, const char *path)
463 for (i = 0; i < specs->count; i++)
464 if (matches (specs->paths[i].path, path))
466 bool allowedp = specs->paths[i].allowedp;
467 DEBUGP (("%s path %s because of rule %s.\n",
468 allowedp ? "Allowing" : "Rejecting",
469 path, quote (specs->paths[i].path)));
475 /* Registering the specs. */
/* Global registry mapping "host:port" strings to robot_specs objects;
   created lazily by res_register_specs. */
477 static struct hash_table *registered_specs;
479 /* Stolen from cookies.c. */
/* Build the "HOST:PORT" key into RESULT.  The buffer comes from
   alloca, so RESULT is only valid in the calling stack frame and must
   be xstrdup'ed before being stored anywhere persistent.
   NOTE(review): the macro's `} while (0)` tail is elided here. */
480 #define SET_HOSTPORT(host, port, result) do { \
481 int HP_len = strlen (host); \
482 result = alloca (HP_len + 1 + numdigit (port) + 1); \
483 memcpy (result, host, HP_len); \
484 result[HP_len] = ':'; \
485 number_to_string (result + HP_len + 1, port); \
488 /* Register RES specs that belong to server on HOST:PORT. They will
489 later be retrievable using res_get_specs. */
492 res_register_specs (const char *host, int port, struct robot_specs *specs)
494 struct robot_specs *old;
/* NOTE(review): the declarations of hp and hp_old are elided from
   this excerpt. */
496 SET_HOSTPORT (host, port, hp);
/* Create the (case-insensitive) table lazily on first registration. */
498 if (!registered_specs)
499 registered_specs = make_nocase_string_hash_table (0);
/* Replacing an existing entry: reuse the already-heap-allocated key
   hp_old.  NOTE(review): the lines freeing the old specs are elided. */
501 if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
505 hash_table_put (registered_specs, hp_old, specs);
/* First registration for this host:port -- the key must be heap-copied
   because hp was built with alloca. */
509 hash_table_put (registered_specs, xstrdup (hp), specs);
513 /* Get the specs that belong to HOST:PORT. */
/* Look up the specs previously registered for HOST:PORT.
   NOTE(review): the declaration of hp and the early-return body for
   the empty-table case are elided from this excerpt. */
516 res_get_specs (const char *host, int port)
519 SET_HOSTPORT (host, port, hp);
/* No table yet means nothing was ever registered. */
520 if (!registered_specs)
522 return hash_table_get (registered_specs, hp);
525 /* Loading the robots file. */
527 #define RES_SPECS_LOCATION "/robots.txt"
529 /* Retrieve the robots.txt from the server root of the server that
530 serves URL. The file will be named according to the currently
531 active rules, and the file name will be returned in *file.
533 Return true if robots were retrieved OK, false otherwise. */
/* Fetch /robots.txt from the server root of URL into a local file whose
   name is returned in *FILE (see comment above).  Returns true on
   successful retrieval.  NOTE(review): this excerpt elides several
   lines -- the declaration of `err`, the `if` guards around the
   url_parse error branch and the retrieve, the iri_free/xfree cleanup,
   and opt.spider handling among them. */
536 res_retrieve_file (const char *url, char **file, struct iri *iri)
538 struct iri *i = iri_new ();
/* Build "<server root>/robots.txt" from URL. */
540 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
/* Save global option values that are temporarily overridden below. */
541 int saved_ts_val = opt.timestamping;
542 int saved_sp_val = opt.spider, url_err;
543 struct url * url_parsed;
545 /* Copy server URI encoding for a possible IDNA transformation, no need to
546 encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
547 set_uri_encoding (i, iri->uri_encoding, false);
548 i->utf8_encode = false;
550 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
/* Disable timestamping for this internal fetch. */
552 opt.timestamping = false;
555 url_parsed = url_parse (robots_url, &url_err, iri, true);
/* NOTE(review): the `if (!url_parsed)` guard for this error branch is
   elided from this excerpt. */
558 char *error = url_error (robots_url, url_err);
559 logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
565 err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
567 url_free(url_parsed);
/* Restore the saved global options. */
570 opt.timestamping = saved_ts_val;
571 opt.spider = saved_sp_val;
575 if (err != RETROK && *file != NULL)
577 /* If the file is not retrieved correctly, but retrieve_url
578 allocated the file name, deallocate is here so that the
579 caller doesn't have to worry about it. */
583 return err == RETROK;
/* Return true if URL is itself the /robots.txt resource of its server,
   i.e. merging RES_SPECS_LOCATION onto URL yields an equal URL.
   NOTE(review): the xfree of robots_url and the `return ret;` are
   elided from this excerpt. */
587 is_robots_txt_url (const char *url)
589 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
590 bool ret = are_urls_equal (url, robots_url);
/* NOTE(review): the enclosing function's header is missing from this
   excerpt; judging by the body it is the module cleanup routine that
   tears down the registered_specs table -- confirm in full file. */
600 if (registered_specs)
602 hash_table_iterator iter;
/* Walk every registered host:port entry. */
603 for (hash_table_iterate (registered_specs, &iter);
604 hash_table_iter_next (&iter);
/* Free each specs object.  NOTE(review): the corresponding xfree of
   the heap-allocated key (iter.key) is not visible in this excerpt. */
608 free_specs (iter.value);
/* Destroy the table itself and reset the global for a clean restart. */
610 hash_table_destroy (registered_specs);
611 registered_specs = NULL;
/* Table-driven unit test for is_robots_txt_url: only an exact
   "<root>/robots.txt" URL should be recognized.  NOTE(review): the
   struct declaration for test_array, the declaration of `i`, loop
   braces, and the function's return value are elided in this
   excerpt. */
618 test_is_robots_txt_url()
623 bool expected_result;
625 { "http://www.yoyodyne.com/robots.txt", true },
626 { "http://www.yoyodyne.com/somepath/", false },
627 { "http://www.yoyodyne.com/somepath/robots.txt", false },
630 for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
632 mu_assert ("test_is_robots_txt_url: wrong result",
633 is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);