}
-static inline bool IsByteOrderMark(uc32 c) {
- // The Unicode value U+FFFE is guaranteed never to be assigned as a
- // Unicode character; this implies that in a Unicode context the
- // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
- // character expressed in little-endian byte order (since it could
- // not be a U+FFFE character expressed in big-endian byte
- // order). Nevertheless, we check for it to be compatible with
- // Spidermonkey.
- return c == 0xFEFF || c == 0xFFFE;
-}
-
-
uc32 UTF16Buffer::Advance() {
// NOTE: It is of importance to Persian / Farsi resources that we do
// *not* strip format control characters in the scanner; see
// https://bugzilla.mozilla.org/show_bug.cgi?id=274152
//
// So, even though ECMA-262, section 7.1, page 11, dictates that we
- // must remove Unicode format-control characters, we only remove the BOM.
- // This is in line with how Safari handles it.
+ // must remove Unicode format-control characters, we do not. This is
+ // in line with how IE and SpiderMonkey handle it.
if (!pushback_buffer()->is_empty()) {
pos_++;
return last_ = pushback_buffer()->RemoveLast();
+ } else if (stream_->has_more()) {
+ pos_++;
+ uc32 next = stream_->GetNext();
+ return last_ = next;
} else {
- while (stream_->has_more()) {
- pos_++;
- uc32 next = stream_->GetNext();
- if (!IsByteOrderMark(next)) return last_ = next;
- }
// note: currently the following increment is necessary to avoid a
// test-parser problem!
pos_++;
}
+static inline bool IsByteOrderMark(uc32 c) {
+ // Returns true if c is the byte-order mark U+FEFF or its byte-swapped
+ // form U+FFFE.
+ // The Unicode value U+FFFE is guaranteed never to be assigned as a
+ // Unicode character, so in a Unicode context the 0xFF, 0xFE byte
+ // pattern can only be U+FEFF expressed in little-endian byte order
+ // (it cannot be a big-endian U+FFFE). Nevertheless, we accept U+FFFE
+ // too, to be compatible with SpiderMonkey.
+ return c == 0xFEFF || c == 0xFFFE;
+}
+
+
void Scanner::SkipWhiteSpace(bool initial) {
has_line_terminator_before_next_ = initial;
while (true) {
- while (kIsWhiteSpace.get(c0_)) {
+ // We treat byte-order marks (BOMs) as whitespace for better
+ // compatibility with SpiderMonkey and other JavaScript engines.
+ while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
// IsWhiteSpace() includes line terminators!
if (kIsLineTerminator.get(c0_))
// Ignore line terminators, but remember them. This is necessary
+++ /dev/null
-// Copyright 2008 the V8 project authors. All rights reserved.\r
-// Redistribution and use in source and binary forms, with or without\r
-// modification, are permitted provided that the following conditions are\r
-// met:\r
-//\r
-// * Redistributions of source code must retain the above copyright\r
-// notice, this list of conditions and the following disclaimer.\r
-// * Redistributions in binary form must reproduce the above\r
-// copyright notice, this list of conditions and the following\r
-// disclaimer in the documentation and/or other materials provided\r
-// with the distribution.\r
-// * Neither the name of Google Inc. nor the names of its\r
-// contributors may be used to endorse or promote products derived\r
-// from this software without specific prior written permission.\r
-//\r
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
-\r
-// According to section 7.1 of EcmaScript-262 format control characters\r
-// should be removed before parsing. We're following the discussion at\r
-// https://bugs.webkit.org/show_bug.cgi?id=4931 in only removing the BOM.\r
-// See also https://bugzilla.mozilla.org/show_bug.cgi?id=274152.\r
-\r
-// Ignores BOM (and only BOM) in string literals.\r
-var format_controls =\r
- eval('"\uFEFF\u200F\u200E\u00AD\u2062\u200D\u200C\u200B"');\r
-assertEquals('\u200F\u200E\u00AD\u2062\u200D\u200C\u200B', \r
- format_controls);\r
-\r
-// Ignores BOM in identifiers.\r
-eval('var x\uFEFFy = 7');\r
-assertEquals(7, xy);\r
-\r
-// Doesn't ignore non-BOM format control characters.\r
-assertThrows('var y\u200Fx = 7');\r
-assertThrows('var y\u200Ex = 7');\r
-assertThrows('var y\u20ADx = 7');\r
-assertThrows('var y\u2062x = 7');\r
-assertThrows('var y\u200Dx = 7');\r
-assertThrows('var y\u200Cx = 7');\r
-assertThrows('var y\u200Bx = 7');\r