[lit] Clean up internal diff's encoding handling
authorJoel E. Denny <jdenny.ornl@gmail.com>
Thu, 10 Oct 2019 17:39:41 +0000 (17:39 +0000)
committerJoel E. Denny <jdenny.ornl@gmail.com>
Thu, 10 Oct 2019 17:39:41 +0000 (17:39 +0000)
As suggested by rnk at D67643#1673043, instead of reading files
multiple times until an appropriate encoding is found, read them once
as binary, and then try to decode what was read.

For python >= 3.5, don't fail when attempting to decode the
`diff_bytes` output in order to print it.

Finally, add some tests for encoding handling.

Reviewed By: rnk

Differential Revision: https://reviews.llvm.org/D68664

llvm-svn: 374389

llvm/utils/lit/lit/builtin_commands/diff.py
llvm/utils/lit/tests/Inputs/shtest-shell/diff-encodings.txt [new file with mode: 0644]
llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.bin [new file with mode: 0644]
llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.utf16 [new file with mode: 0644]
llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.utf8 [new file with mode: 0644]
llvm/utils/lit/tests/max-failures.py
llvm/utils/lit/tests/shtest-shell.py

index 885b425..562b9ac 100644 (file)
@@ -1,6 +1,7 @@
 import difflib
 import functools
 import getopt
+import locale
 import os
 import sys
 
@@ -24,37 +25,26 @@ def getDirTree(path, basedir=""):
         return path, sorted(child_trees)
 
 def compareTwoFiles(flags, filepaths):
-    compare_bytes = False
-    encoding = None
     filelines = []
     for file in filepaths:
-        try:
-            with open(file, 'r') as f:
-                filelines.append(f.readlines())
-        except UnicodeDecodeError:
-            try:
-                with io.open(file, 'r', encoding="utf-8") as f:
-                    filelines.append(f.readlines())
-                encoding = "utf-8"
-            except:
-                compare_bytes = True
-
-    if compare_bytes:
-        return compareTwoBinaryFiles(flags, filepaths)
-    else:
-        return compareTwoTextFiles(flags, filepaths, encoding)
+        with open(file, 'rb') as file_bin:
+            filelines.append(file_bin.readlines())
 
-def compareTwoBinaryFiles(flags, filepaths):
-    filelines = []
-    for file in filepaths:
-        with open(file, 'rb') as f:
-            filelines.append(f.readlines())
+    try:
+        return compareTwoTextFiles(flags, filepaths, filelines,
+                                   locale.getpreferredencoding(False))
+    except UnicodeDecodeError:
+        try:
+            return compareTwoTextFiles(flags, filepaths, filelines, "utf-8")
+        except:
+            return compareTwoBinaryFiles(flags, filepaths, filelines)
 
+def compareTwoBinaryFiles(flags, filepaths, filelines):
     exitCode = 0
     if hasattr(difflib, 'diff_bytes'):
         # python 3.5 or newer
         diffs = difflib.diff_bytes(difflib.unified_diff, filelines[0], filelines[1], filepaths[0].encode(), filepaths[1].encode())
-        diffs = [diff.decode() for diff in diffs]
+        diffs = [diff.decode(errors="backslashreplace") for diff in diffs]
     else:
         # python 2.7
         if flags.unified_diff:
@@ -68,15 +58,14 @@ def compareTwoBinaryFiles(flags, filepaths):
         exitCode = 1
     return exitCode
 
-def compareTwoTextFiles(flags, filepaths, encoding):
+def compareTwoTextFiles(flags, filepaths, filelines_bin, encoding):
     filelines = []
-    for file in filepaths:
-        if encoding is None:
-            with open(file, 'r') as f:
-                filelines.append(f.readlines())
-        else:
-            with io.open(file, 'r', encoding=encoding) as f:
-                filelines.append(f.readlines())
+    for lines_bin in filelines_bin:
+        lines = []
+        for line_bin in lines_bin:
+            line = line_bin.decode(encoding=encoding)
+            lines.append(line)
+        filelines.append(lines)
 
     exitCode = 0
     def compose2(f, g):
diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/diff-encodings.txt b/llvm/utils/lit/tests/Inputs/shtest-shell/diff-encodings.txt
new file mode 100644 (file)
index 0000000..d8b9718
--- /dev/null
@@ -0,0 +1,9 @@
+# Check that diff falls back to binary mode if it cannot decode a file.
+
+# RUN: diff -u diff-in.bin diff-in.bin
+# RUN: diff -u diff-in.utf16 diff-in.bin && false || true
+# RUN: diff -u diff-in.utf8 diff-in.bin && false || true
+# RUN: diff -u diff-in.bin diff-in.utf8 && false || true
+
+# Fail so lit will print output.
+# RUN: false
diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.bin b/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.bin
new file mode 100644 (file)
index 0000000..06b800b
Binary files /dev/null and b/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.bin differ
diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.utf16 b/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.utf16
new file mode 100644 (file)
index 0000000..d7d9fee
Binary files /dev/null and b/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.utf16 differ
diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.utf8 b/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.utf8
new file mode 100644 (file)
index 0000000..86e041d
--- /dev/null
@@ -0,0 +1,3 @@
+foo
+bar
+baz
index f37f73d..45224a7 100644 (file)
@@ -8,7 +8,7 @@
 #
 # END.
 
-# CHECK: Failing Tests (27)
+# CHECK: Failing Tests (28)
 # CHECK: Failing Tests (1)
 # CHECK: Failing Tests (2)
 # CHECK: error: Option '--max-failures' requires positive integer
index 3978e44..6d9b1aa 100644 (file)
 # CHECK: error: command failed with exit status: 127
 # CHECK: ***
 
+
+# CHECK: FAIL: shtest-shell :: diff-encodings.txt
+# CHECK: *** TEST 'shtest-shell :: diff-encodings.txt' FAILED ***
+
+# CHECK: $ "diff" "-u" "diff-in.bin" "diff-in.bin"
+# CHECK-NOT: error
+
+# CHECK: $ "diff" "-u" "diff-in.utf16" "diff-in.bin"
+# CHECK: # command output:
+# CHECK-NEXT: ---
+# CHECK-NEXT: +++
+# CHECK-NEXT: @@
+# CHECK-NEXT: {{^ .f.o.o.$}}
+# CHECK-NEXT: {{^-.b.a.r.$}}
+# CHECK-NEXT: {{^\+.b.a.r..}}
+# CHECK-NEXT: {{^ .b.a.z.$}}
+# CHECK: error: command failed with exit status: 1
+# CHECK: $ "true"
+
+# CHECK: $ "diff" "-u" "diff-in.utf8" "diff-in.bin"
+# CHECK: # command output:
+# CHECK-NEXT: ---
+# CHECK-NEXT: +++
+# CHECK-NEXT: @@
+# CHECK-NEXT: -foo
+# CHECK-NEXT: -bar
+# CHECK-NEXT: -baz
+# CHECK-NEXT: {{^\+.f.o.o.$}}
+# CHECK-NEXT: {{^\+.b.a.r..}}
+# CHECK-NEXT: {{^\+.b.a.z.$}}
+# CHECK: error: command failed with exit status: 1
+# CHECK: $ "true"
+
+# CHECK: $ "diff" "-u" "diff-in.bin" "diff-in.utf8"
+# CHECK: # command output:
+# CHECK-NEXT: ---
+# CHECK-NEXT: +++
+# CHECK-NEXT: @@
+# CHECK-NEXT: {{^\-.f.o.o.$}}
+# CHECK-NEXT: {{^\-.b.a.r..}}
+# CHECK-NEXT: {{^\-.b.a.z.$}}
+# CHECK-NEXT: +foo
+# CHECK-NEXT: +bar
+# CHECK-NEXT: +baz
+# CHECK: error: command failed with exit status: 1
+# CHECK: $ "true"
+
+# CHECK: $ "false"
+
+# CHECK: ***
+
+
 # CHECK: FAIL: shtest-shell :: diff-error-1.txt
 # CHECK: *** TEST 'shtest-shell :: diff-error-1.txt' FAILED ***
 # CHECK: $ "diff" "-B" "temp1.txt" "temp2.txt"
 # CHECK: PASS: shtest-shell :: sequencing-0.txt
 # CHECK: XFAIL: shtest-shell :: sequencing-1.txt
 # CHECK: PASS: shtest-shell :: valid-shell.txt
-# CHECK: Failing Tests (27)
+# CHECK: Failing Tests (28)