From ed7d9e9c5ba3668afa374f4773b20c0698efdeca Mon Sep 17 00:00:00 2001 From: olehougaard Date: Fri, 19 Dec 2008 13:28:56 +0000 Subject: [PATCH] Reverting the BOM changes due to security concerns. Review URL: http://codereview.chromium.org/14890 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@1011 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/scanner.cc | 41 +++++++++++++++++++------------------ test/mjsunit/bom.js | 50 --------------------------------------------- 2 files changed, 21 insertions(+), 70 deletions(-) delete mode 100755 test/mjsunit/bom.js diff --git a/src/scanner.cc b/src/scanner.cc index e95fe9311..3ae6936f2 100644 --- a/src/scanner.cc +++ b/src/scanner.cc @@ -119,18 +119,6 @@ void UTF16Buffer::PushBack(uc32 ch) { } -static inline bool IsByteOrderMark(uc32 c) { - // The Unicode value U+FFFE is guaranteed never to be assigned as a - // Unicode character; this implies that in a Unicode context the - // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF - // character expressed in little-endian byte order (since it could - // not be a U+FFFE character expressed in big-endian byte - // order). Nevertheless, we check for it to be compatible with - // Spidermonkey. - return c == 0xFEFF || c == 0xFFFE; -} - - uc32 UTF16Buffer::Advance() { // NOTE: It is of importance to Persian / Farsi resources that we do // *not* strip format control characters in the scanner; see @@ -138,17 +126,16 @@ uc32 UTF16Buffer::Advance() { // https://bugzilla.mozilla.org/show_bug.cgi?id=274152 // // So, even though ECMA-262, section 7.1, page 11, dictates that we - // must remove Unicode format-control characters, we only remove the BOM. - // This is in line with how Safari handles it. + // must remove Unicode format-control characters, we do not. This is + // in line with how IE and SpiderMonkey handles it. if (!pushback_buffer()->is_empty()) { pos_++; return last_ = pushback_buffer()->RemoveLast(); + } else if (stream_->has_more()) { + pos_++; + uc32 next = stream_->GetNext(); + return last_ = next; } else { - while (stream_->has_more()) { - pos_++; - uc32 next = stream_->GetNext(); - if (!IsByteOrderMark(next)) return last_ = next; - } // note: currently the following increment is necessary to avoid a // test-parser problem! pos_++; @@ -247,11 +234,25 @@ void Scanner::PushBack(uc32 ch) { } +static inline bool IsByteOrderMark(uc32 c) { + // The Unicode value U+FFFE is guaranteed never to be assigned as a + // Unicode character; this implies that in a Unicode context the + // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF + // character expressed in little-endian byte order (since it could + // not be a U+FFFE character expressed in big-endian byte + // order). Nevertheless, we check for it to be compatible with + // Spidermonkey. + return c == 0xFEFF || c == 0xFFFE; +} + + void Scanner::SkipWhiteSpace(bool initial) { has_line_terminator_before_next_ = initial; while (true) { - while (kIsWhiteSpace.get(c0_)) { + // We treat byte-order marks (BOMs) as whitespace for better + // compatibility with Spidermonkey and other JavaScript engines. + while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) { // IsWhiteSpace() includes line terminators! if (kIsLineTerminator.get(c0_)) // Ignore line terminators, but remember them. This is necessary diff --git a/test/mjsunit/bom.js b/test/mjsunit/bom.js deleted file mode 100755 index 4d6974d19..000000000 --- a/test/mjsunit/bom.js +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2008 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// According to section 7.1 of EcmaScript-262 format control characters -// should be removed before parsing. We're following the discussion at -// https://bugs.webkit.org/show_bug.cgi?id=4931 in only removing the BOM. -// See also https://bugzilla.mozilla.org/show_bug.cgi?id=274152. - -// Ignores BOM (and only BOM) in string literals. -var format_controls = - eval('"\uFEFF\u200F\u200E\u00AD\u2062\u200D\u200C\u200B"'); -assertEquals('\u200F\u200E\u00AD\u2062\u200D\u200C\u200B', - format_controls); - -// Ignores BOM in identifiers. -eval('var x\uFEFFy = 7'); -assertEquals(7, xy); - -// Doesn't ignore non-BOM format control characters. -assertThrows('var y\u200Fx = 7'); -assertThrows('var y\u200Ex = 7'); -assertThrows('var y\u20ADx = 7'); -assertThrows('var y\u2062x = 7'); -assertThrows('var y\u200Dx = 7'); -assertThrows('var y\u200Cx = 7'); -assertThrows('var y\u200Bx = 7'); -- 2.34.1