1 /* Support for Robot Exclusion Standard (RES).
2 Copyright (C) 2001, 2006, 2007, 2008, 2009 Free Software Foundation,
5 This file is part of Wget.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or (at
10 your option) any later version.
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
20 Additional permission under GNU GPL version 3 section 7
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
31 /* This file implements the Robot Exclusion Standard (RES).
33 RES is a simple protocol that enables site admins to signalize to
34 the web crawlers that certain parts of the site should not be
35 accessed. All the admin needs to do is create a "robots.txt" file
36 in the web server root, and use simple commands to allow or
37 disallow access to certain parts of the site.
39 The first specification was written by Martijn Koster in 1994, and
40 is still available at <http://www.robotstxt.org/wc/norobots.html>.
41 In 1996, Martijn wrote an Internet Draft specifying an improved RES
42 specification; however, that work was apparently abandoned since
43 the draft has expired in 1997 and hasn't been replaced since. The
45 <http://www.robotstxt.org/wc/norobots-rfc.html>.
47 This file implements RES as specified by the draft. Note that this
48 only handles the "robots.txt" support. The META tag that controls
49 whether the links should be followed is handled in `html-url.c'.
53 * The end-of-line comment recognition is more in the spirit of the
54 Bourne Shell (as specified by RES-1994). That means that
55 "foo#bar" is taken literally, whereas "foo #bar" is interpreted
56 as "foo". The Draft apparently specifies that both should be
59 * We don't recognize sole CR as the line ending.
61 * We don't implement expiry mechanism for /robots.txt specs. I
62 consider it non-necessary for a relatively short-lived
63 application such as Wget. Besides, it is highly questionable
64 whether anyone deploys the recommended expiry scheme for
67 Entry points are functions res_parse, res_parse_from_file,
68 res_match_path, res_register_specs, res_get_specs, and
/* NOTE(review): the two fields below belong to struct definitions whose
   opening and closing lines are missing from this excerpt (apparently
   `struct path_info` and `struct robot_specs`) -- confirm against the
   full file before relying on this layout. */
92 bool user_agent_exact_p;
/* Dynamic array of parsed allow/disallow path rules. */
98 struct path_info *paths;
101 /* Parsing the robot spec. */
103 /* Check whether AGENT (a string of length LENGTH) equals "wget" or
104 "*". If it is either of them, *matches is set to one. If it is
105 "wget", *exact_match is set to one. */
/* Decide whether AGENT (LENGTH bytes, not NUL-terminated) applies to us:
   either the wildcard "*" or the literal "wget" (case-insensitive).
   NOTE(review): this excerpt is missing lines of the function (its
   storage class/return type, braces, and the assignments that set
   *matches) -- only the two comparison branches are visible. */
108 match_user_agent (const char *agent, int length,
109 bool *matches, bool *exact_match)
/* A sole "*" applies to every crawler, but is not an exact match. */
111 if (length == 1 && *agent == '*')
114 *exact_match = false;
/* Case-insensitive comparison of AGENT against the literal "wget". */
116 else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
124 *exact_match = false;
128 /* Add a path specification between PATH_B and PATH_E as one of the
/* Appends one allow/disallow rule to SPECS->paths, growing the array
   as needed.  NOTE(review): several statements are elided in this
   excerpt -- the declaration of the local `pp`, the increment of
   specs->count, and the size-growth arithmetic. */
132 add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
133 bool allowedp, bool exactp)
/* Stored paths carry no leading slash; strip it from the input. */
136 if (path_b < path_e && *path_b == '/')
137 /* Our path representation doesn't use a leading slash, so remove
/* Heap-copy the [PATH_B, PATH_E) range into the rule. */
140 pp.path = strdupdelim (path_b, path_e);
141 pp.allowedp = allowedp;
142 pp.user_agent_exact_p = exactp;
/* Grow the array when count exceeds the allocated size.
   NOTE(review): the growth-policy lines are not visible here. */
144 if (specs->count > specs->size)
146 if (specs->size == 0)
150 specs->paths = xrealloc (specs->paths,
151 specs->size * sizeof (struct path_info));
/* Store the new rule in the last (just-claimed) slot. */
153 specs->paths[specs->count - 1] = pp;
156 /* Recreate SPECS->paths with only those paths that have
157 user_agent_exact_p set to true. */
160 prune_non_exact (struct robot_specs *specs)
162 struct path_info *newpaths;
/* NOTE(review): the declarations of i, j, cnt and the statement that
   accumulates `cnt` are missing from this excerpt. */
/* First pass: count the exact-match entries to keep. */
165 for (i = 0; i < specs->count; i++)
166 if (specs->paths[i].user_agent_exact_p)
/* Allocate the pruned array and copy only exact-match entries. */
168 newpaths = xnew_array (struct path_info, cnt);
169 for (i = 0, j = 0; i < specs->count; i++)
170 if (specs->paths[i].user_agent_exact_p)
171 newpaths[j++] = specs->paths[i];
/* Release the old array and install the pruned replacement. */
173 xfree (specs->paths);
174 specs->paths = newpaths;
/* True when P has reached the (comment-trimmed) end of the line;
   relies on `lineend` being in scope at the expansion site. */
179 #define EOL(p) ((p) >= lineend)
/* Advance P past whitespace, stopping at end of line.
   NOTE(review): the macro's loop body and `} while (0)` tail are not
   visible in this excerpt. */
181 #define SKIP_SPACE(p) do { \
182 while (!EOL (p) && c_isspace (*p)) \
/* Case-insensitive match of the current field name (between
   field_b/field_e, set by the parser) against a string literal. */
186 #define FIELD_IS(string_literal) \
187 BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
189 /* Parse textual RES specs beginning with SOURCE of length LENGTH.
190 Return a specs objects ready to be fed to res_match_path.
192 The parsing itself is trivial, but creating a correct SPECS object
193 is trickier than it seems, because RES is surprisingly byzantine if
194 you attempt to implement it correctly.
196 A "record" is a block of one or more `User-Agent' lines followed by
197 one or more `Allow' or `Disallow' lines. Record is accepted by
198 Wget if one of the `User-Agent' lines was "wget", or if the user
201 After all the lines have been read, we examine whether an exact
202 ("wget") user-agent field was specified. If so, we delete all the
203 lines read under "User-Agent: *" blocks because we have our own
204 Wget-specific blocks. This enables the admin to say:
213 This means that to Wget and to Google, /cgi-bin is disallowed,
214 whereas for all other crawlers, everything is disallowed.
215 res_parse is implemented so that the order of records doesn't
216 matter. In the case above, the "User-Agent: *" could have come
217 after the other one. */
/* Parse LENGTH bytes of robots.txt text at SOURCE into a freshly
   allocated robot_specs object (semantics described in the comment
   above).  NOTE(review): many lines of this function are missing from
   this excerpt -- the main line loop header, several braces, the
   line_count/found_exact updates, value_b/value_e extraction, and the
   return statement among them.  Comments below annotate only the
   statements that are visible. */
220 res_parse (const char *source, int length)
224 const char *p = source;
225 const char *end = source + length;
227 /* true if last applicable user-agent field matches Wget. */
228 bool user_agent_applies = false;
230 /* true if last applicable user-agent field *exactly* matches
232 bool user_agent_exact = false;
234 /* whether we ever encountered exact user agent. */
235 bool found_exact = false;
237 /* count of allow/disallow lines in the current "record", i.e. after
238 the last `user-agent' instructions. */
239 int record_count = 0;
/* Result object, zero-initialized by xnew0. */
241 struct robot_specs *specs = xnew0 (struct robot_specs);
245 const char *lineend, *lineend_real;
246 const char *field_b, *field_e;
247 const char *value_b, *value_e;
/* Locate the physical end of the current line. */
251 lineend_real = memchr (p, '\n', end - p);
256 lineend = lineend_real;
258 /* Before doing anything else, check whether the line is empty
/* Skip blank lines and whole-line comments. */
261 if (EOL (p) || *p == '#')
264 /* Make sure the end-of-line comments are respected by setting
265 lineend to a location preceding the first comment. Real line
266 ending remains in lineend_real. */
267 for (lineend = p; lineend < lineend_real; lineend++)
268 if ((lineend == p || c_isspace (*(lineend - 1)))
272 /* Ignore trailing whitespace in the same way. */
273 while (lineend > p && c_isspace (*(lineend - 1)))
/* Scan the field name: alphanumerics and '-' up to the ':'. */
279 while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
/* A line with no field name or no ':' separator is malformed. */
284 if (field_b == field_e || EOL (p) || *p != ':')
286 DEBUGP (("Ignoring malformed line %d", line_count));
297 /* Finally, we have a syntactically valid line. */
298 if (FIELD_IS ("user-agent"))
300 /* We have to support several cases:
307 ... matching record ...
311 ... non-matching record ...
314 ... matching record, but will be pruned later ...
316 We have to respect `User-Agent' at the beginning of each
317 new record simply because we don't know if we're going to
318 encounter "Wget" among the agents or not. Hence,
319 match_user_agent is called when record_count != 0.
321 But if record_count is 0, we have to keep calling it
322 until it matches, and if that happens, we must not call
323 it any more, until the next record. Hence the other part
325 if (record_count != 0 || user_agent_applies == false)
326 match_user_agent (value_b, value_e - value_b,
327 &user_agent_applies, &user_agent_exact);
328 if (user_agent_exact)
332 else if (FIELD_IS ("allow"))
334 if (user_agent_applies)
/* Record an "allow" rule for the current agent block. */
336 add_path (specs, value_b, value_e, true, user_agent_exact);
340 else if (FIELD_IS ("disallow"))
342 if (user_agent_applies)
344 bool allowed = false;
345 if (value_b == value_e)
346 /* Empty "disallow" line means everything is *allowed*! */
348 add_path (specs, value_b, value_e, allowed, user_agent_exact);
/* Unknown field names are ignored (logged only in debug mode). */
354 DEBUGP (("Ignoring unknown field at line %d", line_count));
365 /* We've encountered an exactly matching user-agent. Throw out
366 all the stuff with user-agent: *. */
367 prune_non_exact (specs);
369 else if (specs->size > specs->count)
371 /* add_path normally over-allocates specs->paths. Reallocate it
372 to the correct size in order to conserve some memory. */
373 specs->paths = xrealloc (specs->paths,
374 specs->count * sizeof (struct path_info));
375 specs->size = specs->count;
381 /* The same like res_parse, but first map the FILENAME into memory,
382 and then parse it. */
385 res_parse_from_file (const char *filename)
387 struct robot_specs *specs;
/* read_file maps/reads the whole file into memory (fm->content,
   fm->length). */
388 struct file_memory *fm = read_file (filename);
/* NOTE(review): the `if (!fm)` guard around this error message, the
   early return, and the fm cleanup are elided from this excerpt. */
391 logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
392 filename, strerror (errno));
/* Parse the in-memory file contents into a specs object. */
395 specs = res_parse (fm->content, fm->length);
/* Release memory owned by SPECS: each stored path string and the paths
   array.  NOTE(review): the declaration of `i` and the final free of
   SPECS itself are elided from this excerpt -- confirm in full file. */
401 free_specs (struct robot_specs *specs)
404 for (i = 0; i < specs->count; i++)
405 xfree (specs->paths[i].path);
/* paths may be NULL when no rules were ever added. */
406 xfree_null (specs->paths);
410 /* Matching of a path according to the specs. */
412 /* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
413 that number is not a numerical representation of '/', decode C and
414 advance the pointer. */
/* NOTE(review): lines of this macro are elided in this excerpt (the
   assignment of `decoded` back into C and the pointer advance).  Per
   the comment above, '/' is deliberately left %-encoded so an encoded
   slash never matches a literal path separator. */
416 #define DECODE_MAYBE(c, ptr) do { \
417 if (c == '%' && c_isxdigit (ptr[1]) && c_isxdigit (ptr[2])) \
419 char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \
420 if (decoded != '/') \
428 /* The inner matching engine: return true if RECORD_PATH matches
429 URL_PATH. The rules for matching are described at
430 <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2. */
/* Return true if RECORD_PATH matches URL_PATH under the rules of the
   RES draft (section 3.2.2, cited above).  NOTE(review): the actual
   comparison loop and return statements are elided from this excerpt;
   only the cursor setup and the decode steps are visible. */
433 matches (const char *record_path, const char *url_path)
435 const char *rp = record_path;
436 const char *up = url_path;
/* Decode a %XX escape in either string unless it encodes '/'. */
446 DECODE_MAYBE(rc, rp);
447 DECODE_MAYBE(uc, up);
453 /* Iterate through all paths in SPECS. For the first one that
454 matches, return its allow/reject status. If none matches,
455 retrieval is by default allowed. */
/* Return the allow/disallow verdict of the first rule in SPECS that
   matches PATH (first match wins; per the comment above, no match
   means allowed).  NOTE(review): the declaration of `i`, the braces,
   and both return statements are elided from this excerpt. */
458 res_match_path (const struct robot_specs *specs, const char *path)
463 for (i = 0; i < specs->count; i++)
464 if (matches (specs->paths[i].path, path))
466 bool allowedp = specs->paths[i].allowedp;
467 DEBUGP (("%s path %s because of rule %s.\n",
468 allowedp ? "Allowing" : "Rejecting",
469 path, quote (specs->paths[i].path)));
475 /* Registering the specs. */
/* Global registry mapping "host:port" strings to robot_specs objects;
   created lazily by res_register_specs. */
477 static struct hash_table *registered_specs;
479 /* Stolen from cookies.c. */
/* Build the "HOST:PORT" key into RESULT.  The buffer comes from
   alloca, so RESULT is only valid in the calling stack frame and must
   be xstrdup'ed before being stored anywhere persistent.
   NOTE(review): the macro's `} while (0)` tail is elided here. */
480 #define SET_HOSTPORT(host, port, result) do { \
481 int HP_len = strlen (host); \
482 result = alloca (HP_len + 1 + numdigit (port) + 1); \
483 memcpy (result, host, HP_len); \
484 result[HP_len] = ':'; \
485 number_to_string (result + HP_len + 1, port); \
488 /* Register RES specs that belong to server on HOST:PORT. They will
489 later be retrievable using res_get_specs. */
492 res_register_specs (const char *host, int port, struct robot_specs *specs)
494 struct robot_specs *old;
/* NOTE(review): the declarations of hp and hp_old are elided from
   this excerpt. */
496 SET_HOSTPORT (host, port, hp);
/* Create the (case-insensitive) table lazily on first registration. */
498 if (!registered_specs)
499 registered_specs = make_nocase_string_hash_table (0);
/* Replacing an existing entry: reuse the already-heap-allocated key
   hp_old.  NOTE(review): the lines freeing the old specs are elided. */
501 if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
505 hash_table_put (registered_specs, hp_old, specs);
/* First registration for this host:port -- the key must be heap-copied
   because hp was built with alloca. */
509 hash_table_put (registered_specs, xstrdup (hp), specs);
513 /* Get the specs that belong to HOST:PORT. */
/* Look up the specs previously registered for HOST:PORT.
   NOTE(review): the declaration of hp and the early-return body for
   the empty-table case are elided from this excerpt. */
516 res_get_specs (const char *host, int port)
519 SET_HOSTPORT (host, port, hp);
/* No table yet means nothing was ever registered. */
520 if (!registered_specs)
522 return hash_table_get (registered_specs, hp);
525 /* Loading the robots file. */
527 #define RES_SPECS_LOCATION "/robots.txt"
529 /* Retrieve the robots.txt from the server root of the server that
530 serves URL. The file will be named according to the currently
531 active rules, and the file name will be returned in *file.
533 Return true if robots were retrieved OK, false otherwise. */
/* Fetch /robots.txt from the server root of URL into a local file whose
   name is returned in *FILE (see comment above).  Returns true on
   successful retrieval.  NOTE(review): this excerpt elides several
   lines -- the declaration of `err`, the `if` guards around the
   url_parse error branch and the retrieve, the iri_free/xfree cleanup,
   and opt.spider handling among them. */
536 res_retrieve_file (const char *url, char **file, struct iri *iri)
538 struct iri *i = iri_new ();
/* Build "<server root>/robots.txt" from URL. */
540 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
/* Save global option values that are temporarily overridden below. */
541 int saved_ts_val = opt.timestamping;
542 int saved_sp_val = opt.spider, url_err;
543 struct url * url_parsed;
545 /* Copy server URI encoding for a possible IDNA transformation, no need to
546 encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
547 set_uri_encoding (i, iri->uri_encoding, false);
548 i->utf8_encode = false;
550 logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
/* Disable timestamping for this internal fetch. */
552 opt.timestamping = false;
555 url_parsed = url_parse (robots_url, &url_err, iri, true);
/* NOTE(review): the `if (!url_parsed)` guard for this error branch is
   elided from this excerpt. */
558 char *error = url_error (robots_url, url_err);
559 logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
565 err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
567 url_free(url_parsed);
/* Restore the saved global options. */
570 opt.timestamping = saved_ts_val;
571 opt.spider = saved_sp_val;
575 if (err != RETROK && *file != NULL)
577 /* If the file is not retrieved correctly, but retrieve_url
578 allocated the file name, deallocate is here so that the
579 caller doesn't have to worry about it. */
583 return err == RETROK;
/* Return true if URL is itself the /robots.txt resource of its server,
   i.e. merging RES_SPECS_LOCATION onto URL yields an equal URL.
   NOTE(review): the xfree of robots_url and the `return ret;` are
   elided from this excerpt. */
587 is_robots_txt_url (const char *url)
589 char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
590 bool ret = are_urls_equal (url, robots_url);
/* NOTE(review): the enclosing function's header is missing from this
   excerpt; judging by the body it is the module cleanup routine that
   tears down the registered_specs table -- confirm in full file. */
600 if (registered_specs)
602 hash_table_iterator iter;
/* Walk every registered host:port entry. */
603 for (hash_table_iterate (registered_specs, &iter);
604 hash_table_iter_next (&iter);
/* Free each specs object.  NOTE(review): the corresponding xfree of
   the heap-allocated key (iter.key) is not visible in this excerpt. */
608 free_specs (iter.value);
/* Destroy the table itself and reset the global for a clean restart. */
610 hash_table_destroy (registered_specs);
611 registered_specs = NULL;
/* Table-driven unit test for is_robots_txt_url: only an exact
   "<root>/robots.txt" URL should be recognized.  NOTE(review): the
   struct declaration for test_array, the declaration of `i`, loop
   braces, and the function's return value are elided in this
   excerpt. */
618 test_is_robots_txt_url()
623 bool expected_result;
625 { "http://www.yoyodyne.com/robots.txt", true },
626 { "http://www.yoyodyne.com/somepath/", false },
627 { "http://www.yoyodyne.com/somepath/robots.txt", false },
630 for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
632 mu_assert ("test_is_robots_txt_url: wrong result",
633 is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);