ls: unicode fixes
author Denys Vlasenko <vda.linux@googlemail.com>
Sun, 31 Jan 2010 04:15:38 +0000 (05:15 +0100)
committer Denys Vlasenko <vda.linux@googlemail.com>
Sun, 31 Jan 2010 04:15:38 +0000 (05:15 +0100)
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
TODO_unicode
coreutils/ls.c
include/libbb.h
include/unicode.h
libbb/Kbuild
libbb/printable_string.c [new file with mode: 0644]
testsuite/ls.mk_uni_tests [new file with mode: 0644]
testsuite/ls.tests [new file with mode: 0755]

index c29fd93..b310e8d 100644 (file)
@@ -7,7 +7,7 @@ dumpleases
 Applets which may need unicode handling (more extensive than sanitizing
 of filenames in error messages):
 
-ls - uses unicode_strlen, not scrlen
+ls - work in progress
 expand, unexpand - uses unicode_strlen, not scrlen
 ash, hush through lineedit - uses unicode_strlen, not scrlen
 top - need to sanitize process args
index 6c898b7..d004ce8 100644 (file)
@@ -241,9 +241,6 @@ struct dnode {
        IF_SELINUX(security_context_t sid;)
 };
 
-static struct dnode **list_dir(const char *, unsigned *);
-static unsigned list_single(const struct dnode *);
-
 struct globals {
 #if ENABLE_FEATURE_LS_COLOR
        smallint show_color;
@@ -528,31 +525,236 @@ static void dnsort(struct dnode **dn, int size)
 #endif
 
 
-static void showfiles(struct dnode **dn, unsigned nfiles)
+static unsigned calc_name_len(const char *name)
+{ /* Returns the width print_name() reports for the same name, without printing anything */
+       unsigned len;
+       uni_stat_t uni_stat;
+
+       // TODO: quote tab as \t, etc, if -Q
+       name = printable_string(&uni_stat, name); /* printable form of name (may be name itself); also fills in uni_stat */
+
+       if (!(option_mask32 & OPT_Q)) {
+               return uni_stat.unicode_width; /* no quoting: just the width of the printable name */
+       }
+
+       len = 2 + uni_stat.unicode_width; /* 2 = the two surrounding double quotes */
+       while (*name) {
+               if (*name == '"' || *name == '\\') {
+                       len++; /* room for the backslash print_name() emits before this char */
+               }
+               name++;
+       }
+       return len;
+}
+
+
+/* Print name to stdout (quoted if -Q); return the number of columns used.
+ * Note that only STYLE_COLUMNS uses the return value;
+ * STYLE_SINGLE and STYLE_LONG don't care.
+ * coreutils 7.2 also supports:
+ * ls -b (--escape) = octal escapes (although it doesn't seem to work)
+ * ls -N (--literal) = no escaping at all
+ */
+static unsigned print_name(const char *name)
+{
+       unsigned len;
+       uni_stat_t uni_stat;
+
+       // TODO: quote tab as \t, etc, if -Q
+       name = printable_string(&uni_stat, name); /* printable form of name (may be name itself); also fills in uni_stat */
+
+       if (!(option_mask32 & OPT_Q)) {
+               fputs(name, stdout);
+               return uni_stat.unicode_width;
+       }
+
+       len = 2 + uni_stat.unicode_width; /* 2 = the two surrounding double quotes */
+       putchar('"');
+       while (*name) {
+               if (*name == '"' || *name == '\\') {
+                       putchar('\\'); /* escape an embedded double quote or backslash */
+                       len++;
+               }
+               putchar(*name++);
+       }
+       putchar('"');
+       return len;
+}
+
+/* Print the all_fmt-selected fields of one entry; return the number of used columns.
+ * Note that only STYLE_COLUMNS uses the return value,
+ * STYLE_SINGLE and STYLE_LONG don't care.
+ */
+static NOINLINE unsigned list_single(const struct dnode *dn)
 {
-       unsigned i, ncols, nrows, row, nc;
        unsigned column = 0;
-       unsigned nexttab = 0;
-       unsigned column_width = 0; /* for STYLE_LONG and STYLE_SINGLE not used */
+       char *lpath = lpath; /* for compiler (self-init presumably silences an uninitialized-use warning) */
+#if ENABLE_FEATURE_LS_FILETYPES || ENABLE_FEATURE_LS_COLOR
+       struct stat info;
+       char append;
+#endif
 
        /* Never happens:
-       if (dn == NULL || nfiles < 1)
-               return;
+       if (dn->fullname == NULL)
+               return 0;
        */
 
-       if (all_fmt & STYLE_LONG) {
+#if ENABLE_FEATURE_LS_FILETYPES
+       append = append_char(dn->dstat.st_mode);
+#endif
+
+       /* Do readlink early, so that if it fails, error message
+        * does not appear *inside* the "ls -l" line */
+       if (all_fmt & LIST_SYMLINK)
+               if (S_ISLNK(dn->dstat.st_mode))
+                       lpath = xmalloc_readlink_or_warn(dn->fullname);
+
+       if (all_fmt & LIST_INO)
+               column += printf("%7llu ", (long long) dn->dstat.st_ino);
+       if (all_fmt & LIST_BLOCKS)
+               column += printf("%4"OFF_FMT"u ", (off_t) (dn->dstat.st_blocks >> 1)); /* st_blocks halved - presumably 512-byte units shown as 1K blocks */
+       if (all_fmt & LIST_MODEBITS)
+               column += printf("%-10s ", (char *) bb_mode_string(dn->dstat.st_mode));
+       if (all_fmt & LIST_NLINKS)
+               column += printf("%4lu ", (long) dn->dstat.st_nlink);
+#if ENABLE_FEATURE_LS_USERNAME
+       if (all_fmt & LIST_ID_NAME) {
+               if (option_mask32 & OPT_g) {
+                       column += printf("%-8.8s ",
+                               get_cached_username(dn->dstat.st_uid));
+               } else {
+                       column += printf("%-8.8s %-8.8s ",
+                               get_cached_username(dn->dstat.st_uid),
+                               get_cached_groupname(dn->dstat.st_gid));
+               }
+       }
+#endif
+       if (all_fmt & LIST_ID_NUMERIC) {
+               if (option_mask32 & OPT_g)
+                       column += printf("%-8u ", (int) dn->dstat.st_uid);
+               else
+                       column += printf("%-8u %-8u ",
+                                       (int) dn->dstat.st_uid,
+                                       (int) dn->dstat.st_gid);
+       }
+       if (all_fmt & (LIST_SIZE /*|LIST_DEV*/ )) {
+               if (S_ISBLK(dn->dstat.st_mode) || S_ISCHR(dn->dstat.st_mode)) {
+                       column += printf("%4u, %3u ",
+                                       (int) major(dn->dstat.st_rdev),
+                                       (int) minor(dn->dstat.st_rdev));
+               } else {
+                       if (all_fmt & LS_DISP_HR) {
+                               column += printf("%"HUMAN_READABLE_MAX_WIDTH_STR"s ",
+                                       /* print st_size, show one fractional, use suffixes */
+                                       make_human_readable_str(dn->dstat.st_size, 1, 0)
+                               );
+                       } else {
+                               column += printf("%9"OFF_FMT"u ", (off_t) dn->dstat.st_size);
+                       }
+               }
+       }
+#if ENABLE_FEATURE_LS_TIMESTAMPS
+       if (all_fmt & (LIST_FULLTIME|LIST_DATE_TIME)) {
+               char *filetime;
+               time_t ttime = dn->dstat.st_mtime; /* default: modification time */
+               if (all_fmt & TIME_ACCESS)
+                       ttime = dn->dstat.st_atime;
+               if (all_fmt & TIME_CHANGE)
+                       ttime = dn->dstat.st_ctime;
+               filetime = ctime(&ttime);
+               /* filetime's format: "Wed Jun 30 21:49:08 1993\n" */
+               if (all_fmt & LIST_FULLTIME)
+                       column += printf("%.24s ", filetime);
+               else { /* LIST_DATE_TIME */
+                       /* current_time_t ~== time(NULL) */
+                       time_t age = current_time_t - ttime;
+                       printf("%.6s ", filetime + 4); /* "Jun 30" */
+                       if (age < 3600L * 24 * 365 / 2 && age > -15 * 60) { /* under half a year old and at most 15 minutes in the future */
+                               /* hh:mm if less than 6 months old */
+                               printf("%.5s ", filetime + 11);
+                       } else { /* year. buggy if year > 9999 ;) */
+                               printf(" %.4s ", filetime + 20);
+                       }
+                       column += 13; /* 7 chars ("Jun 30 ") + 6 chars ("hh:mm " or " yyyy ") */
+               }
+       }
+#endif
+#if ENABLE_SELINUX
+       if (all_fmt & LIST_CONTEXT) {
+               column += printf("%-32s ", dn->sid ? dn->sid : "unknown");
+               freecon(dn->sid);
+       }
+#endif
+       if (all_fmt & LIST_FILENAME) {
+#if ENABLE_FEATURE_LS_COLOR
+               if (show_color) {
+                       info.st_mode = 0; /* for fgcolor() */
+                       lstat(dn->fullname, &info);
+                       printf("\033[%u;%um", bold(info.st_mode),
+                                       fgcolor(info.st_mode));
+               }
+#endif
+               column += print_name(dn->name);
+               if (show_color) {
+                       printf("\033[0m"); /* reset terminal attributes after the colored name */
+               }
+       }
+       if (all_fmt & LIST_SYMLINK) {
+               if (S_ISLNK(dn->dstat.st_mode) && lpath) {
+                       printf(" -> ");
+#if ENABLE_FEATURE_LS_FILETYPES || ENABLE_FEATURE_LS_COLOR
+#if ENABLE_FEATURE_LS_COLOR
+                       info.st_mode = 0; /* for fgcolor() */
+#endif
+                       if (stat(dn->fullname, &info) == 0) { /* stat() follows the link: use the mode of the target */
+                               append = append_char(info.st_mode);
+                       }
+#endif
+#if ENABLE_FEATURE_LS_COLOR
+                       if (show_color) {
+                               printf("\033[%u;%um", bold(info.st_mode),
+                                          fgcolor(info.st_mode));
+                       }
+#endif
+                       column += print_name(lpath) + 4; /* 4 = width of " -> " */
+                       if (show_color) {
+                               printf("\033[0m");
+                       }
+                       free(lpath);
+               }
+       }
+#if ENABLE_FEATURE_LS_FILETYPES
+       if (all_fmt & LIST_FILETYPE) {
+               if (append) {
+                       putchar(append);
+                       column++;
+               }
+       }
+#endif
+
+       return column;
+}
+
+static void showfiles(struct dnode **dn, unsigned nfiles)
+{
+       unsigned i, ncols, nrows, row, nc;
+       unsigned column = 0;
+       unsigned nexttab = 0;
+       unsigned column_width = 0; /* used only by STYLE_COLUMNS */
+
+       if (all_fmt & STYLE_LONG) { /* STYLE_LONG or STYLE_SINGLE */
                ncols = 1;
        } else {
                /* find the longest file name, use that as the column width */
                for (i = 0; dn[i]; i++) {
-                       int len = unicode_strlen(dn[i]->name);
+                       int len = calc_name_len(dn[i]->name);
                        if (column_width < len)
                                column_width = len;
                }
                column_width += tabstops +
                        IF_SELINUX( ((all_fmt & LIST_CONTEXT) ? 33 : 0) + )
-                                    ((all_fmt & LIST_INO) ? 8 : 0) +
-                                    ((all_fmt & LIST_BLOCKS) ? 5 : 0);
+                               ((all_fmt & LIST_INO) ? 8 : 0) +
+                               ((all_fmt & LIST_BLOCKS) ? 5 : 0);
                ncols = (int) (terminal_width / column_width);
        }
 
@@ -618,6 +820,8 @@ static off_t calculate_blocks(struct dnode **dn)
 #endif
 
 
+static struct dnode **list_dir(const char *, unsigned *);
+
 static void showdirs(struct dnode **dn, int first)
 {
        unsigned nfiles;
@@ -733,188 +937,6 @@ static struct dnode **list_dir(const char *path, unsigned *nfiles_p)
 }
 
 
-static int print_name(const char *name)
-{
-       if (option_mask32 & OPT_Q) {
-#if ENABLE_FEATURE_ASSUME_UNICODE
-               unsigned len = 2 + unicode_strlen(name);
-#else
-               unsigned len = 2;
-#endif
-               putchar('"');
-               while (*name) {
-                       if (*name == '"') {
-                               putchar('\\');
-                               len++;
-                       }
-                       putchar(*name++);
-                       if (!ENABLE_FEATURE_ASSUME_UNICODE)
-                               len++;
-               }
-               putchar('"');
-               return len;
-       }
-       /* No -Q: */
-#if ENABLE_FEATURE_ASSUME_UNICODE
-       fputs(name, stdout);
-       return unicode_strlen(name);
-#else
-       return printf("%s", name);
-#endif
-}
-
-
-static NOINLINE unsigned list_single(const struct dnode *dn)
-{
-       unsigned column = 0;
-       char *lpath = lpath; /* for compiler */
-#if ENABLE_FEATURE_LS_FILETYPES || ENABLE_FEATURE_LS_COLOR
-       struct stat info;
-       char append;
-#endif
-
-       /* Never happens:
-       if (dn->fullname == NULL)
-               return 0;
-       */
-
-#if ENABLE_FEATURE_LS_FILETYPES
-       append = append_char(dn->dstat.st_mode);
-#endif
-
-       /* Do readlink early, so that if it fails, error message
-        * does not appear *inside* the "ls -l" line */
-       if (all_fmt & LIST_SYMLINK)
-               if (S_ISLNK(dn->dstat.st_mode))
-                       lpath = xmalloc_readlink_or_warn(dn->fullname);
-
-       if (all_fmt & LIST_INO)
-               column += printf("%7llu ", (long long) dn->dstat.st_ino);
-       if (all_fmt & LIST_BLOCKS)
-               column += printf("%4"OFF_FMT"u ", (off_t) (dn->dstat.st_blocks >> 1));
-       if (all_fmt & LIST_MODEBITS)
-               column += printf("%-10s ", (char *) bb_mode_string(dn->dstat.st_mode));
-       if (all_fmt & LIST_NLINKS)
-               column += printf("%4lu ", (long) dn->dstat.st_nlink);
-#if ENABLE_FEATURE_LS_USERNAME
-       if (all_fmt & LIST_ID_NAME) {
-               if (option_mask32 & OPT_g) {
-                       column += printf("%-8.8s ",
-                               get_cached_username(dn->dstat.st_uid));
-               } else {
-                       column += printf("%-8.8s %-8.8s ",
-                               get_cached_username(dn->dstat.st_uid),
-                               get_cached_groupname(dn->dstat.st_gid));
-               }
-       }
-#endif
-       if (all_fmt & LIST_ID_NUMERIC) {
-               if (option_mask32 & OPT_g)
-                       column += printf("%-8u ", (int) dn->dstat.st_uid);
-               else
-                       column += printf("%-8u %-8u ",
-                                       (int) dn->dstat.st_uid,
-                                       (int) dn->dstat.st_gid);
-       }
-       if (all_fmt & (LIST_SIZE /*|LIST_DEV*/ )) {
-               if (S_ISBLK(dn->dstat.st_mode) || S_ISCHR(dn->dstat.st_mode)) {
-                       column += printf("%4u, %3u ",
-                                       (int) major(dn->dstat.st_rdev),
-                                       (int) minor(dn->dstat.st_rdev));
-               } else {
-                       if (all_fmt & LS_DISP_HR) {
-                               column += printf("%"HUMAN_READABLE_MAX_WIDTH_STR"s ",
-                                       /* print st_size, show one fractional, use suffixes */
-                                       make_human_readable_str(dn->dstat.st_size, 1, 0)
-                               );
-                       } else {
-                               column += printf("%9"OFF_FMT"u ", (off_t) dn->dstat.st_size);
-                       }
-               }
-       }
-#if ENABLE_FEATURE_LS_TIMESTAMPS
-       if (all_fmt & (LIST_FULLTIME|LIST_DATE_TIME)) {
-               char *filetime;
-               time_t ttime = dn->dstat.st_mtime;
-               if (all_fmt & TIME_ACCESS)
-                       ttime = dn->dstat.st_atime;
-               if (all_fmt & TIME_CHANGE)
-                       ttime = dn->dstat.st_ctime;
-               filetime = ctime(&ttime);
-               /* filetime's format: "Wed Jun 30 21:49:08 1993\n" */
-               if (all_fmt & LIST_FULLTIME)
-                       column += printf("%.24s ", filetime);
-               else { /* LIST_DATE_TIME */
-                       /* current_time_t ~== time(NULL) */
-                       time_t age = current_time_t - ttime;
-                       printf("%.6s ", filetime + 4); /* "Jun 30" */
-                       if (age < 3600L * 24 * 365 / 2 && age > -15 * 60) {
-                               /* hh:mm if less than 6 months old */
-                               printf("%.5s ", filetime + 11);
-                       } else { /* year. buggy if year > 9999 ;) */
-                               printf(" %.4s ", filetime + 20);
-                       }
-                       column += 13;
-               }
-       }
-#endif
-#if ENABLE_SELINUX
-       if (all_fmt & LIST_CONTEXT) {
-               column += printf("%-32s ", dn->sid ? dn->sid : "unknown");
-               freecon(dn->sid);
-       }
-#endif
-       if (all_fmt & LIST_FILENAME) {
-#if ENABLE_FEATURE_LS_COLOR
-               if (show_color) {
-                       info.st_mode = 0; /* for fgcolor() */
-                       lstat(dn->fullname, &info);
-                       printf("\033[%u;%um", bold(info.st_mode),
-                                       fgcolor(info.st_mode));
-               }
-#endif
-               column += print_name(dn->name);
-               if (show_color) {
-                       printf("\033[0m");
-               }
-       }
-       if (all_fmt & LIST_SYMLINK) {
-               if (S_ISLNK(dn->dstat.st_mode) && lpath) {
-                       printf(" -> ");
-#if ENABLE_FEATURE_LS_FILETYPES || ENABLE_FEATURE_LS_COLOR
-#if ENABLE_FEATURE_LS_COLOR
-                       info.st_mode = 0; /* for fgcolor() */
-#endif
-                       if (stat(dn->fullname, &info) == 0) {
-                               append = append_char(info.st_mode);
-                       }
-#endif
-#if ENABLE_FEATURE_LS_COLOR
-                       if (show_color) {
-                               printf("\033[%u;%um", bold(info.st_mode),
-                                          fgcolor(info.st_mode));
-                       }
-#endif
-                       column += print_name(lpath) + 4;
-                       if (show_color) {
-                               printf("\033[0m");
-                       }
-                       free(lpath);
-               }
-       }
-#if ENABLE_FEATURE_LS_FILETYPES
-       if (all_fmt & LIST_FILETYPE) {
-               if (append) {
-                       putchar(append);
-                       column++;
-               }
-       }
-#endif
-
-       return column;
-}
-
-
 int ls_main(int argc UNUSED_PARAM, char **argv)
 {
        struct dnode **dnd;
index 73aea40..a86d644 100644 (file)
@@ -577,11 +577,6 @@ char *strncpy_IFNAMSIZ(char *dst, const char *src) FAST_FUNC;
  * But potentially slow, don't use in one-billion-times loops */
 int bb_putchar(int ch) FAST_FUNC;
 char *xasprintf(const char *format, ...) __attribute__ ((format(printf, 1, 2))) FAST_FUNC RETURNS_MALLOC;
-/* Prints unprintable chars ch as ^C or M-c to file
- * (M-c is used only if ch is ORed with PRINTABLE_META),
- * else it is printed as-is (except for ch = 0x9b) */
-enum { PRINTABLE_META = 0x100 };
-void fputc_printable(int ch, FILE *file) FAST_FUNC;
 // gcc-4.1.1 still isn't good enough at optimizing it
 // (+200 bytes compared to macro)
 //static ALWAYS_INLINE
@@ -594,6 +589,20 @@ void fputc_printable(int ch, FILE *file) FAST_FUNC;
 #define NOT_LONE_CHAR(s,c) ((s)[0] != (c) || (s)[1])
 #define DOT_OR_DOTDOT(s) ((s)[0] == '.' && (!(s)[1] || ((s)[1] == '.' && !(s)[2])))
 
+typedef struct uni_stat_t { /* string statistics filled in by printable_string() */
+       unsigned byte_count; /* presumably the string length in bytes - TODO confirm */
+       unsigned unicode_count; /* presumably the number of unicode characters - TODO confirm */
+       unsigned unicode_width; /* ls treats this as the number of screen columns the string occupies */
+} uni_stat_t;
+/* Returns a string with unprintable chars replaced by '?' or
+ * SUBST_WCHAR. This function is unicode-aware. */
+const char* FAST_FUNC printable_string(uni_stat_t *stats, const char *str);
+/* Prints unprintable char ch as ^C or M-c to file
+ * (M-c is used only if ch is ORed with PRINTABLE_META),
+ * else it is printed as-is (except for ch = 0x9b) */
+enum { PRINTABLE_META = 0x100 };
+void fputc_printable(int ch, FILE *file) FAST_FUNC;
+
 /* dmalloc will redefine these to it's own implementation. It is safe
  * to have the prototypes here unconditionally.  */
 void *malloc_or_warn(size_t size) FAST_FUNC RETURNS_MALLOC;
index f32e565..25ef740 100644 (file)
@@ -23,11 +23,6 @@ size_t FAST_FUNC unicode_strlen(const char *string);
 enum {
        UNI_FLAG_PAD = (1 << 0),
 };
-typedef struct uni_stat_t {
-       unsigned byte_count;
-       unsigned unicode_count;
-       unsigned unicode_width;
-} uni_stat_t;
 //UNUSED: unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src);
 //UNUSED: char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char *src, unsigned width, int flags);
 char* FAST_FUNC unicode_conv_to_printable(uni_stat_t *stats, const char *src);
index 243626d..7e79310 100644 (file)
@@ -73,6 +73,7 @@ lib-y += perror_nomsg_and_die.o
 lib-y += pidfile.o
 lib-y += platform.o
 lib-y += printable.o
+lib-y += printable_string.o
 lib-y += print_flags.o
 lib-y += process_escape_sequence.o
 lib-y += procps.o
diff --git a/libbb/printable_string.c b/libbb/printable_string.c
new file mode 100644 (file)
index 0000000..47565de
--- /dev/null
@@ -0,0 +1,65 @@
+/* vi: set sw=4 ts=4: */
+/*
+ * Unicode support routines.
+ *
+ * Copyright (C) 2010 Denys Vlasenko
+ *
+ * Licensed under GPL version 2, see file LICENSE in this tarball for details.
+ */
+#include "libbb.h"
+#include "unicode.h"
+
+const char* FAST_FUNC printable_string(uni_stat_t *stats, const char *str)
+{ /* Returns str itself if it is all printable ASCII, else a converted copy owned by this function; stats may be NULL on the paths coded here */
+       static char *saved[4]; /* the last 4 converted copies; a slot is freed when it gets reused */
+       static unsigned cur_saved; /* = 0; index of the next saved[] slot to reuse */
+
+       char *dst;
+       const char *s;
+
+       s = str;
+       while (1) { /* scan for the first byte that needs conversion */
+               unsigned char c = *s;
+               if (c == '\0') {
+                       /* 99+% of inputs do not need conversion */
+                       if (stats) {
+                               stats->byte_count = (s - str); /* plain ASCII: bytes == chars == columns */
+                               stats->unicode_count = (s - str);
+                               stats->unicode_width = (s - str);
+                       }
+                       return str;
+               }
+               if (c < ' ') /* control char */
+                       break;
+               if (c >= 0x7f) /* DEL or a byte outside ASCII */
+                       break;
+               s++;
+       }
+
+#if ENABLE_FEATURE_ASSUME_UNICODE
+       dst = unicode_conv_to_printable(stats, str); /* NOTE(review): assumed to return a malloced string, since it is free()d below via saved[] - confirm */
+#else
+       {
+               char *d = dst = xstrdup(str);
+               while (1) {
+                       unsigned char c = *d;
+                       if (c == '\0')
+                               break;
+                       if (c < ' ' || c >= 0x7f)
+                               *d = '?'; /* one replacement char per offending byte, so the length is unchanged */
+                       d++;
+               }
+               if (stats) {
+                       stats->byte_count = (d - dst);
+                       stats->unicode_count = (d - dst);
+                       stats->unicode_width = (d - dst);
+               }
+       }
+#endif
+
+       free(saved[cur_saved]); /* drop the copy handed out 4 conversions ago */
+       saved[cur_saved] = dst;
+       cur_saved = (cur_saved + 1) & (ARRAY_SIZE(saved)-1); /* advance with wrap-around; the mask relies on the array size being a power of two */
+
+       return dst;
+}
diff --git a/testsuite/ls.mk_uni_tests b/testsuite/ls.mk_uni_tests
new file mode 100644 (file)
index 0000000..da0c29f
--- /dev/null
@@ -0,0 +1,111 @@
+# DO NOT EDIT THIS FILE! MOST TEXT EDITORS WILL DAMAGE IT!
+>'0001_1__Some_correct_UTF-8_text___________________________________________|'
+>'0002_2__Boundary_condition_test_cases_____________________________________|'
+>'0003_2.1__First_possible_sequence_of_a_certain_length_____________________|'
+>'0004_2.1.2__2_bytes__U-00000080_:________"\80"______________________________|'
+>'0005_2.1.3__3_bytes__U-00000800_:________"ࠀ"______________________________|'
+>'0006_2.1.4__4_bytes__U-00010000_:________"𐀀"______________________________|'
+>'0007_2.1.5__5_bytes__U-00200000_:________""______________________________|'
+>'0008_2.1.6__6_bytes__U-04000000_:________""______________________________|'
+>'0009_2.2__Last_possible_sequence_of_a_certain_length______________________|'
+>'0010_2.2.1__1_byte___U-0000007F_:________"\7f"______________________________|'
+>'0011_2.2.2__2_bytes__U-000007FF_:________"߿"______________________________|'
+>'0012_2.2.3__3_bytes__U-0000FFFF_:________"￿"______________________________|'
+>'0013_2.2.4__4_bytes__U-001FFFFF_:________""______________________________|'
+>'0014_2.2.5__5_bytes__U-03FFFFFF_:________""______________________________|'
+>'0015_2.2.6__6_bytes__U-7FFFFFFF_:________""______________________________|'
+>'0016_2.3__Other_boundary_conditions_______________________________________|'
+>'0017_2.3.1__U-0000D7FF_=_ed_9f_bf_=_"퟿"___________________________________|'
+>'0018_2.3.2__U-0000E000_=_ee_80_80_=_""___________________________________|'
+>'0019_2.3.3__U-0000FFFD_=_ef_bf_bd_=_"�"___________________________________|'
+>'0020_2.3.4__U-0010FFFF_=_f4_8f_bf_bf_=_"􏿿"________________________________|'
+>'0021_2.3.5__U-00110000_=_f4_90_80_80_=_""________________________________|'
+>'0022_3__Malformed_sequences_______________________________________________|'
+>'0023_3.1__Unexpected_continuation_bytes___________________________________|'
+>'0024_3.1.1__First_continuation_byte_0x80:_"\80"_____________________________|'
+>'0025_3.1.2__Last__continuation_byte_0xbf:_"¿"_____________________________|'
+>'0026_3.1.3__2_continuation_bytes:_"\80¿"____________________________________|'
+>'0027_3.1.4__3_continuation_bytes:_"\80¿\80"___________________________________|'
+>'0028_3.1.5__4_continuation_bytes:_"\80¿\80¿"__________________________________|'
+>'0029_3.1.6__5_continuation_bytes:_"\80¿\80¿\80"_________________________________|'
+>'0030_3.1.7__6_continuation_bytes:_"\80¿\80¿\80¿"________________________________|'
+>'0031_3.1.8__7_continuation_bytes:_"\80¿\80¿\80¿\80"_______________________________|'
+>'0032_3.1.9__Sequence_of_all_64_possible_continuation_bytes__0x80-0xbf_:___|'
+>'0033____"\80\81\82\83\84\85\86\87\88\89\8a\8b\8c\8d\8e\8f_________________________________________________|'
+>'0034_____\90\91\92\93\94\95\96\97\98\99\9a\9b\9c\9d\9e\9f_________________________________________________|'
+>'0035_____ ¡¢£¤¥¦§¨©ª«¬­®¯_________________________________________________|'
+>'0036_____°±²³´µ¶·¸¹º»¼½¾¿"________________________________________________|'
+>'0037_3.2__Lonely_start_characters_________________________________________|'
+>'0038_3.2.1__All_32_first_bytes_of_2-byte_sequences__0xc0-0xdf_,___________|'
+>'0039________each_followed_by_a_space_character:___________________________|'
+>'0040____"À_Á_Â_Ã_Ä_Å_Æ_Ç_È_É_Ê_Ë_Ì_Í_Î_Ï__________________________________|'
+>'0041_____Ð_Ñ_Ò_Ó_Ô_Õ_Ö_×_Ø_Ù_Ú_Û_Ü_Ý_Þ_ß_"________________________________|'
+>'0042_3.2.2__All_16_first_bytes_of_3-byte_sequences__0xe0-0xef_,___________|'
+>'0043________each_followed_by_a_space_character:___________________________|'
+>'0044____"à_á_â_ã_ä_å_æ_ç_è_é_ê_ë_ì_í_î_ï_"________________________________|'
+>'0045_3.2.3__All_8_first_bytes_of_4-byte_sequences__0xf0-0xf7_,____________|'
+>'0046________each_followed_by_a_space_character:___________________________|'
+>'0047____"ð_ñ_ò_ó_ô_õ_ö_÷_"________________________________________________|'
+>'0048_3.2.4__All_4_first_bytes_of_5-byte_sequences__0xf8-0xfb_,____________|'
+>'0049________each_followed_by_a_space_character:___________________________|'
+>'0050____"ø_ù_ú_û_"________________________________________________________|'
+>'0051_3.2.5__All_2_first_bytes_of_6-byte_sequences__0xfc-0xfd_,____________|'
+>'0052________each_followed_by_a_space_character:___________________________|'
+>'0053____"ü_ý_"____________________________________________________________|'
+>'0054_3.3__Sequences_with_last_continuation_byte_missing___________________|'
+>'0055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"À"______|'
+>'0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"à\80"______|'
+>'0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"ð\80\80"______|'
+>'0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"ø\80\80\80"______|'
+>'0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"ü\80\80\80\80"______|'
+>'0060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"ß"______|'
+>'0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"ï¿"______|'
+>'0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"÷¿¿"______|'
+>'0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"û¿¿¿"______|'
+>'0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"ý¿¿¿¿"______|'
+>'0065_3.4__Concatenation_of_incomplete_sequences___________________________|'
+>'0066____"Àà\80ð\80\80ø\80\80\80ü\80\80\80\80ßï¿÷¿¿û¿¿¿ý¿¿¿¿"______________________________________________________|'
+>'0067_3.5__Impossible_bytes________________________________________________|'
+>'0068_3.5.1__fe_=_"þ"______________________________________________________|'
+>'0069_3.5.2__ff_=_"ÿ"______________________________________________________|'
+>'0070_3.5.3__fe_fe_ff_ff_=_"þþÿÿ"__________________________________________|'
+>'0071_4__Overlong_sequences________________________________________________|'
+>'0072_4.1__Examples_of_an_overlong_ASCII_character_________________________|'
+>'0073_4.1.1_U+002F_=_c0_af_____________=_"À¯"_______________________________|'
+>'0074_4.1.2_U+002F_=_e0_80_af__________=_"à\80¯"_______________________________|'
+>'0075_4.1.3_U+002F_=_f0_80_80_af_______=_"ð\80\80¯"_______________________________|'
+>'0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"ø\80\80\80¯"_______________________________|'
+>'0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"ü\80\80\80\80¯"_______________________________|'
+>'0078_4.2__Maximum_overlong_sequences______________________________________|'
+>'0079_4.2.1__U-0000007F_=_c1_bf_____________=_"Á¿"__________________________|'
+>'0080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"à\9f¿"__________________________|'
+>'0081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"ð\8f¿¿"__________________________|'
+>'0082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"ø\87¿¿¿"__________________________|'
+>'0083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"ü\83¿¿¿¿"__________________________|'
+>'0084_4.3__Overlong_representation_of_the_NUL_character____________________|'
+>'0085_4.3.1__U+0000_=_c0_80_____________=_"À\80"______________________________|'
+>'0086_4.3.2__U+0000_=_e0_80_80__________=_"à\80\80"______________________________|'
+>'0087_4.3.3__U+0000_=_f0_80_80_80_______=_"ð\80\80\80"______________________________|'
+>'0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"ø\80\80\80\80"______________________________|'
+>'0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"ü\80\80\80\80\80"______________________________|'
+>'0090_5__Illegal_code_positions____________________________________________|'
+>'0091_5.1_Single_UTF-16_surrogates_________________________________________|'
+>'0092_5.1.1__U+D800_=_ed_a0_80_=_""_______________________________________|'
+>'0093_5.1.2__U+DB7F_=_ed_ad_bf_=_""_______________________________________|'
+>'0094_5.1.3__U+DB80_=_ed_ae_80_=_""_______________________________________|'
+>'0095_5.1.4__U+DBFF_=_ed_af_bf_=_""_______________________________________|'
+>'0096_5.1.5__U+DC00_=_ed_b0_80_=_""_______________________________________|'
+>'0097_5.1.6__U+DF80_=_ed_be_80_=_""_______________________________________|'
+>'0098_5.1.7__U+DFFF_=_ed_bf_bf_=_""_______________________________________|'
+>'0099_5.2_Paired_UTF-16_surrogates_________________________________________|'
+>'0100_5.2.1__U+D800_U+DC00_=_ed_a0_80_ed_b0_80_=_""______________________|'
+>'0101_5.2.2__U+D800_U+DFFF_=_ed_a0_80_ed_bf_bf_=_""______________________|'
+>'0102_5.2.3__U+DB7F_U+DC00_=_ed_ad_bf_ed_b0_80_=_""______________________|'
+>'0103_5.2.4__U+DB7F_U+DFFF_=_ed_ad_bf_ed_bf_bf_=_""______________________|'
+>'0104_5.2.5__U+DB80_U+DC00_=_ed_ae_80_ed_b0_80_=_""______________________|'
+>'0105_5.2.6__U+DB80_U+DFFF_=_ed_ae_80_ed_bf_bf_=_""______________________|'
+>'0106_5.2.7__U+DBFF_U+DC00_=_ed_af_bf_ed_b0_80_=_""______________________|'
+>'0107_5.2.8__U+DBFF_U+DFFF_=_ed_af_bf_ed_bf_bf_=_""______________________|'
+>'0108_5.3_Other_illegal_code_positions_____________________________________|'
+>'0109_5.3.1__U+FFFE_=_ef_bf_be_=_"￾"_______________________________________|'
+>'0110_5.3.2__U+FFFF_=_ef_bf_bf_=_"￿"_______________________________________|'
diff --git a/testsuite/ls.tests b/testsuite/ls.tests
new file mode 100755 (executable)
index 0000000..b0c5da7
--- /dev/null
@@ -0,0 +1,136 @@
+#!/bin/sh
+# Copyright 2010 by Denys Vlasenko
+# Licensed under GPL v2, see file LICENSE for details.
+
+. ./testing.sh
+
+test -f "$bindir/.config" && . "$bindir/.config"
+
+rm -rf ls.testdir 2>/dev/null  # start from a clean slate; hide rm's diagnostics (stderr), same as the final cleanup
+mkdir ls.testdir || exit 1     # the test needs this scratch directory; give up if it cannot be created
+
+# testing "test name" "command" "expected result" "file input" "stdin"
+
+# FIXME: the expected output below is what ls prints now, which is not yet
+# right: in a correct result all '|' chars line up in the same column.
+test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
+&& test x"$CONFIG_SUBST_WCHAR" = x"63" \
+&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \
+&& testing "ls unicode test" \
+"(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \
+'0001_1__Some_correct_UTF-8_text___________________________________________|
+0002_2__Boundary_condition_test_cases_____________________________________|
+0003_2.1__First_possible_sequence_of_a_certain_length_____________________|
+0004_2.1.2__2_bytes__U-00000080_:________"?"______________________________|
+0005_2.1.3__3_bytes__U-00000800_:________"?"______________________________|
+0006_2.1.4__4_bytes__U-00010000_:________"?"______________________________|
+0007_2.1.5__5_bytes__U-00200000_:________"?"______________________________|
+0008_2.1.6__6_bytes__U-04000000_:________"?"______________________________|
+0009_2.2__Last_possible_sequence_of_a_certain_length______________________|
+0010_2.2.1__1_byte___U-0000007F_:________"?"______________________________|
+0011_2.2.2__2_bytes__U-000007FF_:________"?"______________________________|
+0012_2.2.3__3_bytes__U-0000FFFF_:________"?"______________________________|
+0013_2.2.4__4_bytes__U-001FFFFF_:________"?"______________________________|
+0014_2.2.5__5_bytes__U-03FFFFFF_:________"?"______________________________|
+0015_2.2.6__6_bytes__U-7FFFFFFF_:________"?"______________________________|
+0016_2.3__Other_boundary_conditions_______________________________________|
+0017_2.3.1__U-0000D7FF_=_ed_9f_bf_=_"?"___________________________________|
+0018_2.3.2__U-0000E000_=_ee_80_80_=_"?"___________________________________|
+0019_2.3.3__U-0000FFFD_=_ef_bf_bd_=_"?"___________________________________|
+0020_2.3.4__U-0010FFFF_=_f4_8f_bf_bf_=_"?"________________________________|
+0021_2.3.5__U-00110000_=_f4_90_80_80_=_"?"________________________________|
+0022_3__Malformed_sequences_______________________________________________|
+0023_3.1__Unexpected_continuation_bytes___________________________________|
+0024_3.1.1__First_continuation_byte_0x80:_"?"_____________________________|
+0025_3.1.2__Last__continuation_byte_0xbf:_"?"_____________________________|
+0026_3.1.3__2_continuation_bytes:_"??"____________________________________|
+0027_3.1.4__3_continuation_bytes:_"???"___________________________________|
+0028_3.1.5__4_continuation_bytes:_"????"__________________________________|
+0029_3.1.6__5_continuation_bytes:_"?????"_________________________________|
+0030_3.1.7__6_continuation_bytes:_"??????"________________________________|
+0031_3.1.8__7_continuation_bytes:_"???????"_______________________________|
+0032_3.1.9__Sequence_of_all_64_possible_continuation_bytes__0x80-0xbf_:___|
+0033____"????????????????_________________________________________________|
+0034_____????????????????_________________________________________________|
+0035_____????????????????_________________________________________________|
+0036_____????????????????"________________________________________________|
+0037_3.2__Lonely_start_characters_________________________________________|
+0038_3.2.1__All_32_first_bytes_of_2-byte_sequences__0xc0-0xdf_,___________|
+0039________each_followed_by_a_space_character:___________________________|
+0040____"?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?__________________________________|
+0041_____?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_"________________________________|
+0042_3.2.2__All_16_first_bytes_of_3-byte_sequences__0xe0-0xef_,___________|
+0043________each_followed_by_a_space_character:___________________________|
+0044____"?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_"________________________________|
+0045_3.2.3__All_8_first_bytes_of_4-byte_sequences__0xf0-0xf7_,____________|
+0046________each_followed_by_a_space_character:___________________________|
+0047____"?_?_?_?_?_?_?_?_"________________________________________________|
+0048_3.2.4__All_4_first_bytes_of_5-byte_sequences__0xf8-0xfb_,____________|
+0049________each_followed_by_a_space_character:___________________________|
+0050____"?_?_?_?_"________________________________________________________|
+0051_3.2.5__All_2_first_bytes_of_6-byte_sequences__0xfc-0xfd_,____________|
+0052________each_followed_by_a_space_character:___________________________|
+0053____"?_?_"____________________________________________________________|
+0054_3.3__Sequences_with_last_continuation_byte_missing___________________|
+0055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
+0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"??"______|
+0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"???"______|
+0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"????"______|
+0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?????"______|
+0060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______|
+0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"??"______|
+0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"???"______|
+0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"????"______|
+0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?????"______|
+0065_3.4__Concatenation_of_incomplete_sequences___________________________|
+0066____"??????????????????????????????"______________________________________________________|
+0067_3.5__Impossible_bytes________________________________________________|
+0068_3.5.1__fe_=_"?"______________________________________________________|
+0069_3.5.2__ff_=_"?"______________________________________________________|
+0070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________|
+0071_4__Overlong_sequences________________________________________________|
+0072_4.1__Examples_of_an_overlong_ASCII_character_________________________|
+0073_4.1.1_U+002F_=_c0_af_____________=_"??"_______________________________|
+0074_4.1.2_U+002F_=_e0_80_af__________=_"???"_______________________________|
+0075_4.1.3_U+002F_=_f0_80_80_af_______=_"????"_______________________________|
+0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?????"_______________________________|
+0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"??????"_______________________________|
+0078_4.2__Maximum_overlong_sequences______________________________________|
+0079_4.2.1__U-0000007F_=_c1_bf_____________=_"??"__________________________|
+0080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________|
+0081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________|
+0082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________|
+0083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________|
+0084_4.3__Overlong_representation_of_the_NUL_character____________________|
+0085_4.3.1__U+0000_=_c0_80_____________=_"??"______________________________|
+0086_4.3.2__U+0000_=_e0_80_80__________=_"???"______________________________|
+0087_4.3.3__U+0000_=_f0_80_80_80_______=_"????"______________________________|
+0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?????"______________________________|
+0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"??????"______________________________|
+0090_5__Illegal_code_positions____________________________________________|
+0091_5.1_Single_UTF-16_surrogates_________________________________________|
+0092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________|
+0093_5.1.2__U+DB7F_=_ed_ad_bf_=_"?"_______________________________________|
+0094_5.1.3__U+DB80_=_ed_ae_80_=_"?"_______________________________________|
+0095_5.1.4__U+DBFF_=_ed_af_bf_=_"?"_______________________________________|
+0096_5.1.5__U+DC00_=_ed_b0_80_=_"?"_______________________________________|
+0097_5.1.6__U+DF80_=_ed_be_80_=_"?"_______________________________________|
+0098_5.1.7__U+DFFF_=_ed_bf_bf_=_"?"_______________________________________|
+0099_5.2_Paired_UTF-16_surrogates_________________________________________|
+0100_5.2.1__U+D800_U+DC00_=_ed_a0_80_ed_b0_80_=_"??"______________________|
+0101_5.2.2__U+D800_U+DFFF_=_ed_a0_80_ed_bf_bf_=_"??"______________________|
+0102_5.2.3__U+DB7F_U+DC00_=_ed_ad_bf_ed_b0_80_=_"??"______________________|
+0103_5.2.4__U+DB7F_U+DFFF_=_ed_ad_bf_ed_bf_bf_=_"??"______________________|
+0104_5.2.5__U+DB80_U+DC00_=_ed_ae_80_ed_b0_80_=_"??"______________________|
+0105_5.2.6__U+DB80_U+DFFF_=_ed_ae_80_ed_bf_bf_=_"??"______________________|
+0106_5.2.7__U+DBFF_U+DC00_=_ed_af_bf_ed_b0_80_=_"??"______________________|
+0107_5.2.8__U+DBFF_U+DFFF_=_ed_af_bf_ed_bf_bf_=_"??"______________________|
+0108_5.3_Other_illegal_code_positions_____________________________________|
+0109_5.3.1__U+FFFE_=_ef_bf_be_=_"?"_______________________________________|
+0110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________|
+' "" ""
+
+# Clean up
+rm -rf ls.testdir 2>/dev/null
+
+exit $FAILCOUNT