From 2c3b392c8cde19433b8a88ce3896731dfa57b482 Mon Sep 17 00:00:00 2001
From: olehougaard <olehougaard@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Date: Fri, 19 Dec 2008 12:06:11 +0000
Subject: [PATCH] Handling byte-order marks as specified in Ecmascript-262 and
 in compliance with Safari. Review URL: http://codereview.chromium.org/15075

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@1006 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---
 src/scanner.cc      | 41 ++++++++++++++++++-------------------
 test/mjsunit/bom.js | 50 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 21 deletions(-)
 create mode 100755 test/mjsunit/bom.js

diff --git a/src/scanner.cc b/src/scanner.cc
index 3ae6936f2..e95fe9311 100644
--- a/src/scanner.cc
+++ b/src/scanner.cc
@@ -119,6 +119,18 @@ void UTF16Buffer::PushBack(uc32 ch) {
 }
 
 
+static inline bool IsByteOrderMark(uc32 c) {
+  // Returns true if c is a byte-order mark: U+FEFF or its byte-swapped
+  // form U+FFFE. U+FFFE is guaranteed never to be assigned as a Unicode
+  // character; this implies that in a Unicode context the 0xFF, 0xFE
+  // byte pattern can only be interpreted as the U+FEFF character
+  // expressed in little-endian byte order (since it could not be a
+  // U+FFFE character expressed in big-endian byte order). Nevertheless,
+  // we check for U+FFFE too, to be compatible with Spidermonkey.
+  return c == 0xFEFF || c == 0xFFFE;
+}
+
+
 uc32 UTF16Buffer::Advance() {
   // NOTE: It is of importance to Persian / Farsi resources that we do
   // *not* strip format control characters in the scanner; see
@@ -126,16 +138,17 @@ uc32 UTF16Buffer::Advance() {
   //    https://bugzilla.mozilla.org/show_bug.cgi?id=274152
   //
   // So, even though ECMA-262, section 7.1, page 11, dictates that we
-  // must remove Unicode format-control characters, we do not. This is
-  // in line with how IE and SpiderMonkey handles it.
+  // must remove Unicode format-control characters, we only remove the BOM.
+  // This is in line with how Safari handles it.
   if (!pushback_buffer()->is_empty()) {
     pos_++;
     return last_ = pushback_buffer()->RemoveLast();
-  } else if (stream_->has_more()) {
-    pos_++;
-    uc32 next = stream_->GetNext();
-    return last_ = next;
   } else {
+    while (stream_->has_more()) {
+      pos_++;
+      uc32 next = stream_->GetNext();
+      if (!IsByteOrderMark(next)) return last_ = next;
+    }
     // note: currently the following increment is necessary to avoid a
     // test-parser problem!
     pos_++;
@@ -234,25 +247,11 @@ void Scanner::PushBack(uc32 ch) {
 }
 
 
-static inline bool IsByteOrderMark(uc32 c) {
-  // The Unicode value U+FFFE is guaranteed never to be assigned as a
-  // Unicode character; this implies that in a Unicode context the
-  // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
-  // character expressed in little-endian byte order (since it could
-  // not be a U+FFFE character expressed in big-endian byte
-  // order). Nevertheless, we check for it to be compatible with
-  // Spidermonkey.
-  return c == 0xFEFF || c == 0xFFFE;
-}
-
-
 void Scanner::SkipWhiteSpace(bool initial) {
   has_line_terminator_before_next_ = initial;
 
   while (true) {
-    // We treat byte-order marks (BOMs) as whitespace for better
-    // compatibility with Spidermonkey and other JavaScript engines.
-    while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
+    while (kIsWhiteSpace.get(c0_)) {
       // IsWhiteSpace() includes line terminators!
       if (kIsLineTerminator.get(c0_))
         // Ignore line terminators, but remember them. This is necessary
diff --git a/test/mjsunit/bom.js b/test/mjsunit/bom.js
new file mode 100755
index 000000000..4d6974d19
--- /dev/null
+++ b/test/mjsunit/bom.js
@@ -0,0 +1,50 @@
+ï»¿// Copyright 2008 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// According to section 7.1 of ECMA-262, format control characters
+// should be removed before parsing. We're following the discussion at
+// https://bugs.webkit.org/show_bug.cgi?id=4931 in only removing the BOM.
+// See also https://bugzilla.mozilla.org/show_bug.cgi?id=274152.
+
+// Ignores BOM (and only BOM) in string literals.
+var format_controls =
+  eval('"\uFEFF\u200F\u200E\u00AD\u2062\u200D\u200C\u200B"');
+assertEquals('\u200F\u200E\u00AD\u2062\u200D\u200C\u200B', 
+             format_controls);
+
+// Ignores BOM in identifiers.
+eval('var x\uFEFFy = 7');
+assertEquals(7, xy);
+
+// Doesn't ignore non-BOM format control characters.
+assertThrows('var y\u200Fx = 7');
+assertThrows('var y\u200Ex = 7');
+assertThrows('var y\u20ADx = 7');
+assertThrows('var y\u2062x = 7');
+assertThrows('var y\u200Dx = 7');
+assertThrows('var y\u200Cx = 7');
+assertThrows('var y\u200Bx = 7');
-- 
2.34.1