From 13bbc2066cd794842a582639c571a1573a8998d8 Mon Sep 17 00:00:00 2001
From: Stefan Behnel <stefan_ml@behnel.de>
Date: Mon, 30 Sep 2013 22:50:08 +0200
Subject: [PATCH] ignore UTF-8 BOMs at the beginning of source files

---
 Cython/Utils.py           | 25 +++++++++++++++++++++----
 runtests.py               |  9 +++++++--
 tests/compile/utf8bom.pyx |  8 ++++++++
 3 files changed, 36 insertions(+), 6 deletions(-)
 create mode 100644 tests/compile/utf8bom.pyx

diff --git a/Cython/Utils.py b/Cython/Utils.py
index b9fd194..8594118 100644
--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -215,6 +215,17 @@ def detect_opened_file_encoding(f):
                 return encoding.group(1)
     return "UTF-8"
 
+
+def skip_bom(f):
+    """
+    Read past a BOM (U+FEFF) at the beginning of the opened source file 'f',
+    or seek back to offset 0 if its first character is something else.
+    This could be done in the scanner, but it is *substantially* easier here.
+    """
+    if f.read(1) != u'\uFEFF':
+        f.seek(0)
+
+
 normalise_newlines = re.compile(u'\r\n?|\n').sub
 
 
@@ -264,6 +275,7 @@ if sys.version_info >= (2,6):
     except ImportError:
         pass
 
+
 def open_source_file(source_filename, mode="r",
                      encoding=None, error_handling=None,
                      require_normalised_newlines=True):
@@ -272,8 +284,11 @@ def open_source_file(source_filename, mode="r",
         # it's UTF-8.
         f = open_source_file(source_filename, encoding="UTF-8", mode=mode, error_handling='ignore')
         encoding = detect_opened_file_encoding(f)
-        if encoding == "UTF-8" and error_handling=='ignore' and require_normalised_newlines:
+        if (encoding == "UTF-8"
+                and error_handling == 'ignore'
+                and require_normalised_newlines):
             f.seek(0)
+            skip_bom(f)
             return f
         else:
             f.close()
@@ -290,15 +305,17 @@ def open_source_file(source_filename, mode="r",
             pass
     #
     if io is not None:
-        return io.open(source_filename, mode=mode,
-                       encoding=encoding, errors=error_handling)
+        stream = io.open(source_filename, mode=mode,
+                         encoding=encoding, errors=error_handling)
     else:
         # codecs module doesn't have universal newline support
         stream = codecs.open(source_filename, mode=mode,
                              encoding=encoding, errors=error_handling)
         if require_normalised_newlines:
             stream = NormalisedNewlineStream(stream)
-        return stream
+    skip_bom(stream)
+    return stream
+
 
 def open_source_from_loader(loader,
                             source_filename,
diff --git a/runtests.py b/runtests.py
index 0e9f7c1..5b6a7da 100755
--- a/runtests.py
+++ b/runtests.py
@@ -277,6 +277,9 @@ TEST_SUPPORT_DIR = 'testsupport'
 
 BACKENDS = ['c', 'cpp']
 
+UTF8_BOM_BYTES = r'\xef\xbb\xbf'.encode('ISO-8859-1').decode('unicode_escape')  # UTF-8 BOM as read via ISO-8859-1
+
+
 def memoize(f):
     uncomputed = object()
     f._cache = {}
@@ -287,13 +290,15 @@ def memoize(f):
         return res
     return func
 
+
 @memoize
 def parse_tags(filepath):
     tags = defaultdict(list)
-    f = io_open(filepath, encoding='ISO-8859-1', errors='replace')
+    f = io_open(filepath, encoding='ISO-8859-1', errors='ignore')
     try:
         for line in f:
-            line = line.strip()
+            # ignore BOM-like bytes and whitespace
+            line = line.lstrip(UTF8_BOM_BYTES).strip()
             if not line:
                 continue
             if line[0] != '#':
diff --git a/tests/compile/utf8bom.pyx b/tests/compile/utf8bom.pyx
new file mode 100644
index 0000000..c696c8c
--- /dev/null
+++ b/tests/compile/utf8bom.pyx
@@ -0,0 +1,8 @@
+ï»¿# coding: utf-8
+# mode: compile
+
+# this file starts with a UTF-8 encoded BOM
+# the only thing we test is that it properly compiles
+
+def test():
+    pass
-- 
2.7.4