url: Properly parse certain oddly formed urls

author isaacs <i@izs.me>

Mon, 3 Jun 2013 20:39:57 +0000 (13:39 -0700)

committer isaacs <i@izs.me>

Mon, 3 Jun 2013 22:56:16 +0000 (15:56 -0700)
author isaacs <i@izs.me>
Mon, 3 Jun 2013 20:39:57 +0000 (13:39 -0700)
committer isaacs <i@izs.me>
Mon, 3 Jun 2013 22:56:16 +0000 (15:56 -0700)
diff --git a/lib/url.js b/lib/url.js

index b8ba3fb1dd76ba95084f24ef24221d15e455fd64..db7723895b662957bf69fa3563f09d96876443c9 100644 (file)
--- a/lib/url.js
+++ b/lib/url.js
@@ -64,7 +64,7 @@ var protocolPattern = /^([a-z0-9.+-]+:)/i,
      // them.
      nonHostChars = ['%', '/', '?', ';', '#']
        .concat(unwise).concat(autoEscape),
-    nonAuthChars = ['/', '@', '?', '#'].concat(delims),
+    hostEndingChars = ['/', '?', '#'],
      hostnameMaxLen = 255,
      hostnamePartPattern = /^[a-z0-9A-Z_-]{0,63}$/,
      hostnamePartStart = /^([a-z0-9A-Z_-]{0,63})(.*)$/,
@@ -146,49 +146,63 @@ Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) {
  
    if (!hostlessProtocol[proto] &&
        (slashes || (proto && !slashedProtocol[proto]))) {
+
      // there's a hostname.
      // the first instance of /, ?, ;, or # ends the host.
-    // don't enforce full RFC correctness, just be unstupid about it.
-
+    //
      // If there is an @ in the hostname, then non-host chars *are* allowed
-    // to the left of the first @ sign, unless some non-auth character
+    // to the left of the last @ sign, unless some host-ending character
      // comes *before* the @-sign.
      // URLs are obnoxious.
-    var atSign = rest.indexOf('@');
-    if (atSign !== -1) {
-      var auth = rest.slice(0, atSign);
-
-      // there *may be* an auth
-      var hasAuth = true;
-      for (var i = 0, l = nonAuthChars.length; i < l; i++) {
-        if (auth.indexOf(nonAuthChars[i]) !== -1) {
-          // not a valid auth.  Something like http://foo.com/bar@baz/
-          hasAuth = false;
-          break;
-        }
-      }
+    //
+    // ex:
+    // http://a@b@c/ => user:a@b host:c
+    // http://a@b?@c => user:a host:b path:/?@c
+
+    // v0.12 TODO(isaacs): This is not quite how Chrome does things.
+    // Review our test case against browsers more comprehensively.
+
+    // find the first instance of any hostEndingChars
+    var hostEnd = -1;
+    for (var i = 0; i < hostEndingChars.length; i++) {
+      var hec = rest.indexOf(hostEndingChars[i]);
+      if (hec !== -1 && (hostEnd === -1 || hec < hostEnd))
+        hostEnd = hec;
+    }
  
-      if (hasAuth) {
-        // pluck off the auth portion.
-        this.auth = decodeURIComponent(auth);
-        rest = rest.substr(atSign + 1);
-      }
+    // at this point, either we have an explicit point where the
+    // auth portion cannot go past, or the last @ char is the decider.
+    var auth, atSign;
+    if (hostEnd === -1) {
+      // atSign can be anywhere.
+      atSign = rest.lastIndexOf('@');
+    } else {
+      // atSign must be in auth portion.
+      // http://a@b/c@d => host:b auth:a path:/c@d
+      atSign = rest.lastIndexOf('@', hostEnd);
      }
  
-    var firstNonHost = -1;
-    for (var i = 0, l = nonHostChars.length; i < l; i++) {
-      var index = rest.indexOf(nonHostChars[i]);
-      if (index !== -1 &&
-          (firstNonHost < 0 || index < firstNonHost)) firstNonHost = index;
+    // Now we have a portion which is definitely the auth.
+    // Pull that off.
+    if (atSign !== -1) {
+      auth = rest.slice(0, atSign);
+      rest = rest.slice(atSign + 1);
+      this.auth = decodeURIComponent(auth);
      }
  
-    if (firstNonHost !== -1) {
-      this.host = rest.substr(0, firstNonHost);
-      rest = rest.substr(firstNonHost);
-    } else {
-      this.host = rest;
-      rest = '';
+    // the host is the remaining portion to the left of the first non-host char
+    hostEnd = -1;
+    for (var i = 0; i < nonHostChars.length; i++) {
+      var hec = rest.indexOf(nonHostChars[i]);
+      if (hec !== -1 && (hostEnd === -1 || hec < hostEnd))
+        hostEnd = hec;
      }
+    // if we still have not hit it, then the entire thing is a host.
+    if (hostEnd === -1)
+      hostEnd = rest.length;
+
+    this.host = rest.slice(0, hostEnd);
+    rest = rest.slice(hostEnd);
  
      // pull out port.
      this.parseHost();
diff --git a/test/simple/test-url.js b/test/simple/test-url.js

index 6630da1025e2a77b1f3baa9d7c0ca2088e6b1924..d27abbab8a30ee091c44b831e6ffd55888905c98 100644 (file)
--- a/test/simple/test-url.js
+++ b/test/simple/test-url.js
@@ -741,6 +741,45 @@ var parseTests = {
      'path': '/test',
    },
  
+  'http://a@b@c/': {
+    protocol: 'http:',
+    slashes: true,
+    auth: 'a@b',
+    host: 'c',
+    hostname: 'c',
+    href: 'http://a%40b@c/',
+    path: '/',
+    pathname: '/'
+  },
+
+  'http://a@b?@c': {
+    protocol: 'http:',
+    slashes: true,
+    auth: 'a',
+    host: 'b',
+    hostname: 'b',
+    href: 'http://a@b/?@c',
+    path: '/?@c',
+    pathname: '/',
+    search: '?@c',
+    query: '@c'
+  },
+
+  'http://a\r" \t\n<\'b:b@c\r\nd/e?f':{
+    protocol: 'http:',
+    slashes: true,
+    auth: 'a\r" \t\n<\'b:b',
+    host: 'c',
+    port: null,
+    hostname: 'c',
+    hash: null,
+    search: '?f',
+    query: 'f',
+    pathname: '%0D%0Ad/e',
+    path: '%0D%0Ad/e?f',
+    href: 'http://a%0D%22%20%09%0A%3C\'b:b@c/%0D%0Ad/e?f'
+  }
+
  };
  
  for (var u in parseTests) {
author	isaacs <i@izs.me>
	Mon, 3 Jun 2013 20:39:57 +0000 (13:39 -0700)
committer	isaacs <i@izs.me>
	Mon, 3 Jun 2013 22:56:16 +0000 (15:56 -0700)
lib/url.js		patch \| blob \| history
test/simple/test-url.js		patch \| blob \| history