From 13bbc2066cd794842a582639c571a1573a8998d8 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Mon, 30 Sep 2013 22:50:08 +0200 Subject: [PATCH] ignore UTF-8 BOMs at the beginning of source files --- Cython/Utils.py | 25 +++++++++++++++++++++---- runtests.py | 9 +++++++-- tests/compile/utf8bom.pyx | 8 ++++++++ 3 files changed, 36 insertions(+), 6 deletions(-) create mode 100644 tests/compile/utf8bom.pyx diff --git a/Cython/Utils.py b/Cython/Utils.py index b9fd194..8594118 100644 --- a/Cython/Utils.py +++ b/Cython/Utils.py @@ -215,6 +215,17 @@ def detect_opened_file_encoding(f): return encoding.group(1) return "UTF-8" + +def skip_bom(f): + """ + Read past a BOM at the beginning of a source file. + This could be added to the scanner, but it's *substantially* easier + to keep it at this level. + """ + if f.read(1) != u'\uFEFF': + f.seek(0) + + normalise_newlines = re.compile(u'\r\n?|\n').sub @@ -264,6 +275,7 @@ if sys.version_info >= (2,6): except ImportError: pass + def open_source_file(source_filename, mode="r", encoding=None, error_handling=None, require_normalised_newlines=True): @@ -272,8 +284,11 @@ def open_source_file(source_filename, mode="r", # it's UTF-8. f = open_source_file(source_filename, encoding="UTF-8", mode=mode, error_handling='ignore') encoding = detect_opened_file_encoding(f) - if encoding == "UTF-8" and error_handling=='ignore' and require_normalised_newlines: + if (encoding == "UTF-8" + and error_handling == 'ignore' + and require_normalised_newlines): f.seek(0) + skip_bom(f) return f else: f.close() @@ -290,15 +305,17 @@ def open_source_file(source_filename, mode="r", pass # if io is not None: - return io.open(source_filename, mode=mode, - encoding=encoding, errors=error_handling) + stream = io.open(source_filename, mode=mode, + encoding=encoding, errors=error_handling) else: # codecs module doesn't have universal newline support stream = codecs.open(source_filename, mode=mode, encoding=encoding, errors=error_handling) if require_normalised_newlines: stream = NormalisedNewlineStream(stream) - return stream + skip_bom(stream) + return stream + def open_source_from_loader(loader, source_filename, diff --git a/runtests.py b/runtests.py index 0e9f7c1..5b6a7da 100755 --- a/runtests.py +++ b/runtests.py @@ -277,6 +277,9 @@ TEST_SUPPORT_DIR = 'testsupport' BACKENDS = ['c', 'cpp'] +UTF8_BOM_BYTES = r'\xef\xbb\xbf'.encode('ISO-8859-1').decode('unicode_escape') + + def memoize(f): uncomputed = object() f._cache = {} @@ -287,13 +290,15 @@ def memoize(f): return res return func + @memoize def parse_tags(filepath): tags = defaultdict(list) - f = io_open(filepath, encoding='ISO-8859-1', errors='replace') + f = io_open(filepath, encoding='ISO-8859-1', errors='ignore') try: for line in f: - line = line.strip() + # ignore BOM-like bytes and whitespace + line = line.lstrip(UTF8_BOM_BYTES).strip() if not line: continue if line[0] != '#': diff --git a/tests/compile/utf8bom.pyx b/tests/compile/utf8bom.pyx new file mode 100644 index 0000000..c696c8c --- /dev/null +++ b/tests/compile/utf8bom.pyx @@ -0,0 +1,8 @@ +# coding: utf-8 +# mode: compile + +# this file starts with a UTF-8 encoded BOM +# the only thing we test is that it properly compiles + +def test(): + pass -- 2.7.4