Close #954 URL parsing/formatting corrections

author isaacs <i@izs.me>

Wed, 20 Apr 2011 22:44:34 +0000 (15:44 -0700)

committer isaacs <i@izs.me>

Wed, 20 Apr 2011 22:44:34 +0000 (15:44 -0700)
author isaacs <i@izs.me>
Wed, 20 Apr 2011 22:44:34 +0000 (15:44 -0700)
committer isaacs <i@izs.me>
Wed, 20 Apr 2011 22:44:34 +0000 (15:44 -0700)
diff --git a/lib/url.js b/lib/url.js

index cbea2f0..99a0e67 100644 (file)
--- a/lib/url.js
+++ b/lib/url.js
@@ -24,25 +24,40 @@ exports.resolve = urlResolve;
  exports.resolveObject = urlResolveObject;
  exports.format = urlFormat;
  
+// Reference: RFC 3986, RFC 1808, RFC 2396
+
  // define these here so at least they only have to be
  // compiled once on the first module load.
-var protocolPattern = /^([a-z0-9]+:)/,
+var protocolPattern = /^([a-z0-9]+:)/i,
      portPattern = /:[0-9]+$/,
-    delims = ['<', '>', '"', '\'', '`', /\s/],
+    // RFC 2396: characters reserved for delimiting URLs.
+    delims = ['<', '>', '"', '`', ' ', '\r', '\n', '\t'],
+    // RFC 2396: characters not allowed for various reasons.
      unwise = ['{', '}', '|', '\\', '^', '~', '[', ']', '`'].concat(delims),
-    nonHostChars = ['/', '?', ';', '#'].concat(unwise),
+    // Allowed by the RFCs, but a common cause of XSS attacks.  Always escape these.
+    autoEscape = ['\''],
+    // Characters that are never ever allowed in a hostname.
+    // Note that any invalid chars are also handled, but these
+    // are the ones that are *expected* to be seen, so we fast-path
+    // them.
+    nonHostChars = ['%', '/', '?', ';', '#']
+      .concat(unwise).concat(autoEscape),
      hostnameMaxLen = 255,
-    hostnamePartPattern = /^[a-z0-9][a-z0-9A-Z-]{0,62}$/,
+    hostnamePartPattern = /^[a-zA-Z0-9][a-z0-9A-Z-]{0,62}$/,
+    hostnamePartStart = /^([a-zA-Z0-9][a-z0-9A-Z-]{0,62})(.*)$/,
+    // protocols that can allow "unsafe" and "unwise" chars.
      unsafeProtocol = {
        'javascript': true,
        'javascript:': true
      },
+    // protocols that never have a hostname.
      hostlessProtocol = {
        'javascript': true,
        'javascript:': true,
        'file': true,
        'file:': true
      },
+    // protocols that always have a path component.
      pathedProtocol = {
        'http': true,
        'https': true,
@@ -54,6 +69,7 @@ var protocolPattern = /^([a-z0-9]+:)/,
        'gopher:': true,
        'file:': true
      },
+    // protocols that always contain a // bit.
      slashedProtocol = {
        'http': true,
        'https': true,
@@ -74,10 +90,19 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
    var out = {},
        rest = url;
  
+  // cut off any delimiters.
+  // This is to support parsing input like "<http://foo.com>"
+  for (var i = 0, l = rest.length; i < l; i++) {
+    if (delims.indexOf(rest.charAt(i)) === -1) break;
+  }
+  if (i !== 0) rest = rest.substr(i);
+
+
    var proto = protocolPattern.exec(rest);
    if (proto) {
      proto = proto[0];
-    out.protocol = proto;
+    var lowerProto = proto.toLowerCase();
+    out.protocol = lowerProto;
      rest = rest.substr(proto.length);
    }
  
@@ -119,6 +144,7 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
        var key = keys[i];
        out[key] = p[key];
      }
+
      // we've indicated that there is a hostname,
      // so even if it's empty, it has to be present.
      out.hostname = out.hostname || '';
@@ -130,17 +156,49 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
        var hostparts = out.hostname.split(/\./);
        for (var i = 0, l = hostparts.length; i < l; i++) {
          var part = hostparts[i];
+        if (!part) continue;
          if (!part.match(hostnamePartPattern)) {
-          out.hostname = '';
+          var validParts = hostparts.slice(0, i);
+          var notHost = hostparts.slice(i + 1);
+          var bit = part.match(hostnamePartStart);
+          if (bit) {
+            validParts.push(bit[1]);
+            notHost.unshift(bit[2]);
+          }
+          if (notHost.length) {
+            rest = '/' + notHost.join('.') + rest
+          }
+          out.hostname = validParts.join('.');
            break;
          }
        }
      }
+    // hostnames are always lower case.
+    out.hostname = out.hostname.toLowerCase();
+
+    out.host = ((out.auth) ? out.auth + '@' : '') +
+        (out.hostname || '') +
+        ((out.port) ? ':' + out.port : '');
+    out.href += out.host;
    }
  
    // now rest is set to the post-host stuff.
    // chop off any delim chars.
-  if (!unsafeProtocol[proto]) {
+  if (!unsafeProtocol[lowerProto]) {
+
+    // First, make 100% sure that any "autoEscape" chars get
+    // escaped, even if encodeURIComponent doesn't think they
+    // need to be.
+    for (var i = 0, l = autoEscape.length; i < l; i++) {
+      var ae = autoEscape[i];
+      var esc = encodeURIComponent(ae);
+      if (esc === ae) {
+        esc = escape(ae);
+      }
+      rest = rest.split(ae).join(esc);
+    }
+
+    // Now make sure that delims never appear in a url.
      var chop = rest.length;
      for (var i = 0, l = delims.length; i < l; i++) {
        var c = rest.indexOf(delims[i]);
diff --git a/test/simple/test-url.js b/test/simple/test-url.js

index 4f3d139..e52dacd 100644 (file)
--- a/test/simple/test-url.js
+++ b/test/simple/test-url.js
@@ -32,6 +32,99 @@ var parseTests = {
      'href': '//some_path',
      'pathname': '//some_path'
    },
+  'HTTP://www.example.com/' : {
+    'href': 'http://www.example.com/',
+    'protocol': 'http:',
+    'host': 'www.example.com',
+    'hostname': 'www.example.com',
+    'pathname': '/'
+  },
+  'http://www.ExAmPlE.com/' : {
+    'href': 'http://www.example.com/',
+    'protocol': 'http:',
+    'host': 'www.example.com',
+    'hostname': 'www.example.com',
+    'pathname': '/'
+
+  },
+  'http://user:pw@www.ExAmPlE.com/' : {
+    'href': 'http://user:pw@www.example.com/',
+    'protocol': 'http:',
+    'auth': 'user:pw',
+    'host': 'user:pw@www.example.com',
+    'hostname': 'www.example.com',
+    'pathname': '/'
+
+  },
+  'http://USER:PW@www.ExAmPlE.com/' : {
+    'href': 'http://USER:PW@www.example.com/',
+    'protocol': 'http:',
+    'auth': 'USER:PW',
+    'host': 'USER:PW@www.example.com',
+    'hostname': 'www.example.com',
+    'pathname': '/'
+  },
+  'http://x.com/path?that\'s#all, folks' : {
+    'href': 'http://x.com/path?that%27s#all,',
+    'protocol': 'http:',
+    'host': 'x.com',
+    'hostname': 'x.com',
+    'search': '?that%27s',
+    'query': 'that%27s',
+    'pathname': '/path',
+    'hash': '#all,'
+  },
+  'HTTP://X.COM/Y' : {
+    'href': 'http://x.com/Y',
+    'protocol': 'http:',
+    'host': 'x.com',
+    'hostname': 'x.com',
+    'pathname': '/Y',
+  },
+  // an unexpected invalid char in the hostname.
+  'HtTp://x.y.cOm*a/b/c?d=e#f g<h>i' : {
+    'href': 'http://x.y.com/*a/b/c?d=e#f',
+    'protocol': 'http:',
+    'host': 'x.y.com',
+    'hostname': 'x.y.com',
+    'pathname': '/*a/b/c',
+    'search': '?d=e',
+    'query': 'd=e',
+    'hash': '#f'
+  },
+  // make sure that we don't accidentally lowercase the path parts.
+  'HtTp://x.y.cOm*A/b/c?d=e#f g<h>i' : {
+    'href': 'http://x.y.com/*A/b/c?d=e#f',
+    'protocol': 'http:',
+    'host': 'x.y.com',
+    'hostname': 'x.y.com',
+    'pathname': '/*A/b/c',
+    'search': '?d=e',
+    'query': 'd=e',
+    'hash': '#f'
+  },
+  'http://x...y...#p': {
+    'href': 'http://x...y.../#p',
+    'protocol': 'http:',
+    'host': 'x...y...',
+    'hostname': 'x...y...',
+    'hash': '#p',
+    'pathname': '/'
+  },
+  'http://x/p/"quoted"': {
+    'href': 'http://x/p/',
+    'protocol':'http:',
+    'host': 'x',
+    'hostname': 'x',
+    'pathname': '/p/'
+  },
+  '<http://goo.corn/bread> Is a URL!': {
+    'href': 'http://goo.corn/bread',
+    'protocol': 'http:',
+    'host': 'goo.corn',
+    'hostname': 'goo.corn',
+    'pathname': '/bread'
+  },
    'http://www.narwhaljs.org/blog/categories?id=news' : {
      'href': 'http://www.narwhaljs.org/blog/categories?id=news',
      'protocol': 'http:',
@@ -58,17 +151,18 @@ var parseTests = {
      'query': '??&hl=en&src=api&x=2&y=2&z=3&s=',
      'pathname': '/vt/lyrs=m@114'
    },
-  'http://user:pass@mt0.google.com/vt/lyrs=m@114???&hl=en&src=api&x=2&y=2&z=3&s=' : {
-    'href': 'http://user:pass@mt0.google.com/vt/lyrs=m@114???' +
-        '&hl=en&src=api&x=2&y=2&z=3&s=',
-    'protocol': 'http:',
-    'host': 'user:pass@mt0.google.com',
-    'auth': 'user:pass',
-    'hostname': 'mt0.google.com',
-    'search': '???&hl=en&src=api&x=2&y=2&z=3&s=',
-    'query': '??&hl=en&src=api&x=2&y=2&z=3&s=',
-    'pathname': '/vt/lyrs=m@114'
-  },
+  'http://user:pass@mt0.google.com/vt/lyrs=m@114???&hl=en&src=api&x=2&y=2&z=3&s=':
+      {
+        'href': 'http://user:pass@mt0.google.com/vt/lyrs=m@114???' +
+            '&hl=en&src=api&x=2&y=2&z=3&s=',
+        'protocol': 'http:',
+        'host': 'user:pass@mt0.google.com',
+        'auth': 'user:pass',
+        'hostname': 'mt0.google.com',
+        'search': '???&hl=en&src=api&x=2&y=2&z=3&s=',
+        'query': '??&hl=en&src=api&x=2&y=2&z=3&s=',
+        'pathname': '/vt/lyrs=m@114'
+      },
    'file:///etc/passwd' : {
      'href': 'file:///etc/passwd',
      'protocol': 'file:',
@@ -154,7 +248,7 @@ for (var u in parseTests) {
                   'parse(' + u + ').' + i + ' == ' + e + '\nactual: ' + a);
    }
  
-  var expected = u,
+  var expected = parseTests[u].href,
        actual = url.format(parseTests[u]);
  
    assert.equal(expected, actual,
author	isaacs <i@izs.me>
	Wed, 20 Apr 2011 22:44:34 +0000 (15:44 -0700)
committer	isaacs <i@izs.me>
	Wed, 20 Apr 2011 22:44:34 +0000 (15:44 -0700)
lib/url.js		patch \| blob \| history
test/simple/test-url.js		patch \| blob \| history