From 2c3b392c8cde19433b8a88ce3896731dfa57b482 Mon Sep 17 00:00:00 2001 From: olehougaard Date: Fri, 19 Dec 2008 12:06:11 +0000 Subject: [PATCH] Handling byte-order marks as specified in Ecmascript-262 and in compliance with Safari. Review URL: http://codereview.chromium.org/15075 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@1006 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/scanner.cc | 41 ++++++++++++++++++------------------- test/mjsunit/bom.js | 50 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 21 deletions(-) create mode 100755 test/mjsunit/bom.js diff --git a/src/scanner.cc b/src/scanner.cc index 3ae6936f2..e95fe9311 100644 --- a/src/scanner.cc +++ b/src/scanner.cc @@ -119,6 +119,18 @@ void UTF16Buffer::PushBack(uc32 ch) { } +static inline bool IsByteOrderMark(uc32 c) { + // The Unicode value U+FFFE is guaranteed never to be assigned as a + // Unicode character; this implies that in a Unicode context the + // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF + // character expressed in little-endian byte order (since it could + // not be a U+FFFE character expressed in big-endian byte + // order). Nevertheless, we check for it to be compatible with + // Spidermonkey. + return c == 0xFEFF || c == 0xFFFE; +} + + uc32 UTF16Buffer::Advance() { // NOTE: It is of importance to Persian / Farsi resources that we do // *not* strip format control characters in the scanner; see @@ -126,16 +138,17 @@ uc32 UTF16Buffer::Advance() { // https://bugzilla.mozilla.org/show_bug.cgi?id=274152 // // So, even though ECMA-262, section 7.1, page 11, dictates that we - // must remove Unicode format-control characters, we do not. This is - // in line with how IE and SpiderMonkey handles it. + // must remove Unicode format-control characters, we only remove the BOM. + // This is in line with how Safari handles it. 
if (!pushback_buffer()->is_empty()) { pos_++; return last_ = pushback_buffer()->RemoveLast(); - } else if (stream_->has_more()) { - pos_++; - uc32 next = stream_->GetNext(); - return last_ = next; } else { + while (stream_->has_more()) { + pos_++; + uc32 next = stream_->GetNext(); + if (!IsByteOrderMark(next)) return last_ = next; + } // note: currently the following increment is necessary to avoid a // test-parser problem! pos_++; @@ -234,25 +247,11 @@ void Scanner::PushBack(uc32 ch) { } -static inline bool IsByteOrderMark(uc32 c) { - // The Unicode value U+FFFE is guaranteed never to be assigned as a - // Unicode character; this implies that in a Unicode context the - // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF - // character expressed in little-endian byte order (since it could - // not be a U+FFFE character expressed in big-endian byte - // order). Nevertheless, we check for it to be compatible with - // Spidermonkey. - return c == 0xFEFF || c == 0xFFFE; -} - - void Scanner::SkipWhiteSpace(bool initial) { has_line_terminator_before_next_ = initial; while (true) { - // We treat byte-order marks (BOMs) as whitespace for better - // compatibility with Spidermonkey and other JavaScript engines. - while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) { + while (kIsWhiteSpace.get(c0_)) { // IsWhiteSpace() includes line terminators! if (kIsLineTerminator.get(c0_)) // Ignore line terminators, but remember them. This is necessary diff --git a/test/mjsunit/bom.js b/test/mjsunit/bom.js new file mode 100755 index 000000000..4d6974d19 --- /dev/null +++ b/test/mjsunit/bom.js @@ -0,0 +1,50 @@ +// Copyright 2008 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// According to section 7.1 of EcmaScript-262 format control characters +// should be removed before parsing. We're following the discussion at +// https://bugs.webkit.org/show_bug.cgi?id=4931 in only removing the BOM. +// See also https://bugzilla.mozilla.org/show_bug.cgi?id=274152. + +// Ignores BOM (and only BOM) in string literals. +var format_controls = + eval('"\uFEFF\u200F\u200E\u00AD\u2062\u200D\u200C\u200B"'); +assertEquals('\u200F\u200E\u00AD\u2062\u200D\u200C\u200B', + format_controls); + +// Ignores BOM in identifiers. +eval('var x\uFEFFy = 7'); +assertEquals(7, xy); + +// Doesn't ignore non-BOM format control characters. 
+assertThrows('var y\u200Fx = 7'); +assertThrows('var y\u200Ex = 7'); +assertThrows('var y\u00ADx = 7'); +assertThrows('var y\u2062x = 7'); +assertThrows('var y\u200Dx = 7'); +assertThrows('var y\u200Cx = 7'); +assertThrows('var y\u200Bx = 7'); -- 2.34.1