1 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4 """Collection of functions and classes to fix various encoding problems on
5 multiple platforms with python.
def complain(message):
    """Reports a problem on the interpreter's original stderr.

    If any exception occurs in this file, we'll probably try to print it on
    stderr, which makes for frustrating debugging if stderr is directed to our
    wrapper. So be paranoid about catching errors and report them to
    sys.__stderr__, so that the user has a higher chance to see them.

    Args:
        message: What to report. A str is printed verbatim (the empty string
            included); any other object is printed as its repr().
    """
    # A conditional expression, unlike `cond and a or b`, keeps an empty
    # string as-is instead of falling through to repr('').
    text = message if isinstance(message, str) else repr(message)
    print(text, file=sys.__stderr__)
24 def fix_default_encoding():
25 """Forces utf8 solidly on all platforms.
27 By default python execution environment is lazy and defaults to ascii
30 http://uucode.com/blog/2007/03/23/shut-up-you-dummy-7-bit-python/
32 if sys.getdefaultencoding() == 'utf-8':
35 # Regenerate setdefaultencoding.
37 # Module 'sys' has no 'setdefaultencoding' member
38 # pylint: disable=no-member
39 sys.setdefaultencoding('utf-8')
40 for attr in dir(locale):
41 if attr[0:3] != 'LC_':
43 aref = getattr(locale, attr)
45 locale.setlocale(aref, '')
49 lang, _ = locale.getdefaultlocale()
50 except (TypeError, ValueError):
54 locale.setlocale(aref, (lang, 'UTF-8'))
56 os.environ[attr] = lang + '.UTF-8'
58 locale.setlocale(locale.LC_ALL, '')
64 ###############################
69 """Works around <http://bugs.python.org/issue6058>."""
70 # <http://msdn.microsoft.com/en-us/library/dd317756.aspx>
72 codecs.lookup('cp65001')
76 lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None)
80 class WinUnicodeOutputBase(object):
81 """Base class to adapt sys.stdout or sys.stderr to behave correctly on
84 Setting encoding to utf-8 is recommended.
86 def __init__(self, fileno, name, encoding):
87 # Corresponding file handle.
89 self.encoding = encoding
93 self.softspace = False
101 # Don't really close the handle, that would only cause problems.
108 raise NotImplementedError()
110 def write(self, text):
111 raise NotImplementedError()
113 def writelines(self, lines):
117 except Exception as e:
118 complain('%s.writelines: %r' % (self.name, e))
122 class WinUnicodeConsoleOutput(WinUnicodeOutputBase):
123 """Output adapter to a Windows Console.
125 Understands how to use the win32 console API.
127 def __init__(self, console_handle, fileno, stream_name, encoding):
128 super(WinUnicodeConsoleOutput,
129 self).__init__(fileno, '<Unicode console %s>' % stream_name,
131 # Handle to use for WriteConsoleW
132 self._console_handle = console_handle
134 # Loads the necessary function.
135 # These types are available on linux but not Mac.
136 # pylint: disable=no-name-in-module,F0401
137 from ctypes import byref, GetLastError, POINTER, windll, WINFUNCTYPE
138 from ctypes.wintypes import BOOL, DWORD, HANDLE, LPWSTR
139 from ctypes.wintypes import LPVOID # pylint: disable=no-name-in-module
144 # <http://msdn.microsoft.com/en-us/library/ms687401.aspx>
145 self._WriteConsoleW = WINFUNCTYPE(BOOL, HANDLE, LPWSTR, DWORD,
147 LPVOID)(('WriteConsoleW',
149 self._GetLastError = GetLastError
152 # No need to flush the console since it's immediate.
155 def write(self, text):
157 if isinstance(text, bytes):
158 # Bytestrings need to be decoded to a string before being passed
160 text = text.decode(self.encoding, 'replace')
161 remaining = len(text)
164 # There is a shorter-than-documented limitation on the length of
165 # the string passed to WriteConsoleW. See
166 # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
167 retval = self._WriteConsoleW(self._console_handle, text,
168 min(remaining, 10000),
169 self._byref(n), None)
170 if retval == 0 or n.value == 0:
171 raise IOError('WriteConsoleW returned %r, n.value = %r, '
173 (retval, n.value, self._GetLastError()))
177 text = text[int(n.value):]
178 except Exception as e:
179 complain('%s.write: %r' % (self.name, e))
183 class WinUnicodeOutput(WinUnicodeOutputBase):
184 """Output adaptor to a file output on Windows.
186 If the standard FileWrite function is used, it will be encoded in the current
187 code page. WriteConsoleW() permits writing any character.
189 def __init__(self, stream, fileno, encoding):
190 super(WinUnicodeOutput,
191 self).__init__(fileno, '<Unicode redirected %s>' % stream.name,
194 self._stream = stream
202 except Exception as e:
203 complain('%s.flush: %r from %r' % (self.name, e, self._stream))
206 def write(self, text):
208 if isinstance(text, bytes):
209 # Replace characters that cannot be printed instead of failing.
210 text = text.decode(self.encoding, 'replace')
211 # When redirecting to a file or process any \n characters will be
212 # replaced with \r\n. If the text to be printed already has \r\n
213 # line endings then \r\r\n line endings will be generated, leading
214 # to double-spacing of some output. Normalizing line endings to \n
215 # avoids this problem.
216 text = text.replace('\r\n', '\n')
217 self._stream.write(text)
218 except Exception as e:
219 complain('%s.write: %r' % (self.name, e))
def win_handle_is_a_console(handle):
    """Returns True if a Windows file handle is a handle to a console.

    Args:
        handle: Value as returned by GetStdHandle(): INVALID_HANDLE_VALUE,
            None (NULL) or a valid handle.

    Returns:
        True when the handle is a character device (the FILE_TYPE_REMOTE bit
        is ignored) for which GetConsoleMode() succeeds, False otherwise.
    """
    # These types are available on linux but not Mac.
    # pylint: disable=no-name-in-module,F0401
    from ctypes import byref, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import BOOL, DWORD, HANDLE

    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    # A HANDLE is pointer-sized: a HANDLE-typed GetStdHandle() prototype
    # reports INVALID_HANDLE_VALUE as 2**64 - 1 on 64-bit Python, which never
    # equals the 32-bit DWORD(-1). Accept both spellings so that callers
    # holding either form are recognised.
    INVALID_HANDLE_VALUES = (HANDLE(-1).value, DWORD(-1).value)

    # <http://msdn.microsoft.com/en-us/library/ms683167.aspx>
    GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(
        ('GetConsoleMode', windll.kernel32))
    # <http://msdn.microsoft.com/en-us/library/aa364960.aspx>
    # GetFileType() takes a HANDLE, so declare it as such rather than DWORD.
    GetFileType = WINFUNCTYPE(DWORD, HANDLE)(('GetFileType', windll.kernel32))

    # GetStdHandle returns INVALID_HANDLE_VALUE, NULL, or a valid handle.
    if handle is None or handle in INVALID_HANDLE_VALUES:
        return False
    if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
        return False
    # GetConsoleMode() yields a BOOL (int); normalise to a real bool.
    return bool(GetConsoleMode(handle, byref(DWORD())))
def win_get_unicode_stream(stream, excepted_fileno, output_handle, encoding):
    """Returns a unicode-compatible replacement for |stream|.

    A direct-Console writing object (WinUnicodeConsoleOutput) is returned only
    if:
    - the stream's file number equals |excepted_fileno|, and
    - the handle GetStdHandle() reports for |output_handle| is in fact a
      handle to a console.

    In every other case an auto-encoding WinUnicodeOutput wrapping |stream| is
    returned.

    Args:
        stream: Stream to adapt; its fileno() is used when it has one.
        excepted_fileno: File number the stream must have to be considered a
            console candidate.
        output_handle: Identifier passed to GetStdHandle().
        encoding: Encoding handed to the returned object.
    """

    def _no_fileno():
        # Stand-in for streams that expose no fileno() at all.
        return None

    current_fileno = getattr(stream, 'fileno', _no_fileno)()

    is_expected_fileno = current_fileno == excepted_fileno
    if not is_expected_fileno:
        # It's something else. Create an auto-encoding stream.
        return WinUnicodeOutput(stream, current_fileno, encoding)

    # These types are available on linux but not Mac.
    # pylint: disable=no-name-in-module,F0401
    from ctypes import windll, WINFUNCTYPE
    from ctypes.wintypes import DWORD, HANDLE

    # <http://msdn.microsoft.com/en-us/library/ms683231.aspx>
    get_std_handle_proto = WINFUNCTYPE(HANDLE, DWORD)
    GetStdHandle = get_std_handle_proto(('GetStdHandle', windll.kernel32))

    console_candidate = GetStdHandle(DWORD(output_handle))
    if not win_handle_is_a_console(console_candidate):
        # Redirected to a file or a pipe. Create an auto-encoding stream.
        return WinUnicodeOutput(stream, current_fileno, encoding)

    # It's a console.
    return WinUnicodeConsoleOutput(console_candidate, current_fileno,
                                   stream.name, encoding)
def fix_win_console(encoding):
    """Makes Unicode console output work independently of the current code page.

    Replaces sys.stdout and sys.stderr with the objects returned by
    win_get_unicode_stream(), unless either of them already is a
    WinUnicodeOutputBase.

    This also fixes <http://bugs.python.org/issue1602>.
    Credit to Michael Kaplan
    <http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx> and
    <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>.

    Args:
        encoding: Encoding handed to the replacement streams.

    Returns:
        False if a stream was already wrapped and nothing was done; True
        otherwise, including when the replacement failed and was only
        reported through complain().
    """
    already_wrapped = any(
        isinstance(current, WinUnicodeOutputBase)
        for current in (sys.stdout, sys.stderr))
    if already_wrapped:
        return False

    try:
        # SetConsoleCP and SetConsoleOutputCP could be used to change the
        # code page but it's not really useful since the code here is using
        # WriteConsoleW(). Also, changing the code page is 'permanent' to the
        # console and needs to be reverted manually. In practice one needs to
        # set the console font to a TTF font to be able to see all the
        # characters but it failed for me in practice. In any case, it won't
        # throw any exception when printing, which is the important part.
        # -11 and -12 are the identifiers handed to GetStdHandle() for
        # stdout and stderr respectively.
        unicode_stdout = win_get_unicode_stream(sys.stdout, 1, -11, encoding)
        sys.stdout = unicode_stdout
        unicode_stderr = win_get_unicode_stream(sys.stderr, 2, -12, encoding)
        sys.stderr = unicode_stderr
        # TODO(maruel): Do sys.stdin with ReadConsoleW(). Albeit the
        # limitation is "It doesn't appear to be possible to read Unicode
        # characters in UTF-8 mode" and this appears to be a limitation of
        # cmd.exe.
    except Exception as e:
        complain('exception %r while fixing up sys.stdout and sys.stderr' % e)
    return True
309 """Fixes various encoding problems on all platforms.
311 Should be called at the very beginning of the process.
314 if sys.platform == 'win32':
315 ret &= fix_win_codec()
317 ret &= fix_default_encoding()
319 if sys.platform == 'win32':
320 encoding = sys.getdefaultencoding()
321 ret &= fix_win_console(encoding)