libbb/lineedit: add support for preserving "broken" (non-unicode) chars

author Tomas Heinrich <heinrich.tomas@gmail.com>

Thu, 29 Apr 2010 11:43:39 +0000 (13:43 +0200)

committer Denys Vlasenko <vda.linux@googlemail.com>

Thu, 29 Apr 2010 11:43:39 +0000 (13:43 +0200)
author Tomas Heinrich <heinrich.tomas@gmail.com>
Thu, 29 Apr 2010 11:43:39 +0000 (13:43 +0200)
committer Denys Vlasenko <vda.linux@googlemail.com>
Thu, 29 Apr 2010 11:43:39 +0000 (13:43 +0200)
diff --git a/Config.in b/Config.in

index 40af911..a5d2003 100644 (file)
--- a/Config.in
+++ b/Config.in
@@ -223,6 +223,17 @@ config UNICODE_NEUTRAL_TABLE
           With this option on, more extensive (and bigger) table
           of neutral chars will be used.
  
+config UNICODE_PRESERVE_BROKEN
+       bool "Make it possible to enter sequences of chars which are not Unicode"
+       default n
+       depends on UNICODE_SUPPORT
+       help
+         With this option on, invalid UTF-8 bytes are not substituted
+         with the selected substitution character.
+         For example, this means that entering 'l', 's', ' ', 0xff, [Enter]
+         at shell prompt will list file named 0xff (single char name
+         with char value 255), not file named '?'.
+
  config LONG_OPTS
         bool "Support for --long-options"
         default y
diff --git a/libbb/lineedit.c b/libbb/lineedit.c

index dc90846..622f9dd 100644 (file)
--- a/libbb/lineedit.c
+++ b/libbb/lineedit.c
@@ -68,7 +68,7 @@
  
  #undef CHAR_T
  #if ENABLE_UNICODE_SUPPORT
-# define BB_NUL L'\0'
+# define BB_NUL ((wchar_t)0)
  # define CHAR_T wchar_t
  static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); }
  # if ENABLE_FEATURE_EDITING_VI
@@ -92,6 +92,14 @@ static bool BB_ispunct(CHAR_T c) { return ((unsigned)c < 256 && ispunct(c)); }
  #endif
  
  
+# if ENABLE_UNICODE_PRESERVE_BROKEN
+#  define unicode_mark_inv_wchar(wc)   ((wc) | 0x20000000)
+#  define unicode_is_inv_wchar(wc)     ((wc) & 0x20000000)
+# else
+#  define unicode_is_inv_wchar(wc)     0
+# endif
+
+
  enum {
         /* We use int16_t for positions, need to limit line len */
         MAX_LINELEN = CONFIG_FEATURE_EDITING_MAX_LEN < 0x7ff0
@@ -208,24 +216,58 @@ static size_t load_string(const char *src, int maxsize)
         ssize_t len = mbstowcs(command_ps, src, maxsize - 1);
         if (len < 0)
                 len = 0;
-       command_ps[len] = L'\0';
+       command_ps[len] = 0;
         return len;
  }
-static size_t save_string(char *dst, int maxsize)
+static unsigned save_string(char *dst, unsigned maxsize)
  {
+#if !ENABLE_UNICODE_PRESERVE_BROKEN
         ssize_t len = wcstombs(dst, command_ps, maxsize - 1);
         if (len < 0)
                 len = 0;
         dst[len] = '\0';
         return len;
+#else
+       unsigned dstpos = 0;
+       unsigned srcpos = 0;
+
+       maxsize--;
+       while (dstpos < maxsize) {
+               wchar_t wc;
+               int n = srcpos;
+               while ((wc = command_ps[srcpos]) != 0
+                   && !unicode_is_inv_wchar(wc)
+               ) {
+                       srcpos++;
+               }
+               command_ps[srcpos] = 0;
+               n = wcstombs(dst + dstpos, command_ps + n, maxsize - dstpos);
+               if (n < 0) /* should not happen */
+                       break;
+               dstpos += n;
+               if (wc == 0) /* usually is */
+                       break;
+               /* We do have invalid byte here! */
+               command_ps[srcpos] = wc; /* restore it */
+               srcpos++;
+               if (dstpos == maxsize)
+                       break;
+               dst[dstpos++] = (char) wc;
+       }
+       dst[dstpos] = '\0';
+       return dstpos;
+#endif
  }
  /* I thought just fputwc(c, stdout) would work. But no... */
  static void BB_PUTCHAR(wchar_t c)
  {
         char buf[MB_CUR_MAX + 1];
         mbstate_t mbst = { 0 };
-       ssize_t len = wcrtomb(buf, c, &mbst);
+       ssize_t len;
  
+       if (unicode_is_inv_wchar(c))
+               c = CONFIG_SUBST_WCHAR;
+       len = wcrtomb(buf, c, &mbst);
         if (len > 0) {
                 buf[len] = '\0';
                 fputs(buf, stdout);
@@ -238,7 +280,7 @@ static size_t load_string(const char *src, int maxsize)
         return strlen(command_ps);
  }
  # if ENABLE_FEATURE_TAB_COMPLETION
-static void save_string(char *dst, int maxsize)
+static void save_string(char *dst, unsigned maxsize)
  {
         safe_strncpy(dst, command_ps, maxsize);
  }
@@ -1719,13 +1761,11 @@ static int lineedit_read_key(char *read_key_buffer)
   pushback:
                                 /* Invalid sequence. Save all "bad bytes" except first */
                                 read_key_ungets(read_key_buffer, unicode_buf + 1, unicode_idx - 1);
-                               /*
-                                * ic = unicode_buf[0] sounds even better, but currently
-                                * this does not work: wchar_t[] -> char[] conversion
-                                * when lineedit finishes mangles such "raw bytes"
-                                * (by misinterpreting them as unicode chars):
-                                */
+# if !ENABLE_UNICODE_PRESERVE_BROKEN
                                 ic = CONFIG_SUBST_WCHAR;
+# else
+                               ic = unicode_mark_inv_wchar(unicode_buf[0]);
+# endif
                         } else {
                                 /* Valid unicode char, return its code */
                                 ic = wc;
diff --git a/libbb/unicode.c b/libbb/unicode.c

index 83e70b4..d1c6167 100644 (file)
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -423,7 +423,6 @@ static int wcwidth(unsigned ucs)
  # if LAST_SUPPORTED_WCHAR >= 0x300
         /* sorted list of non-overlapping intervals of non-spacing characters */
         /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
-       static const struct interval combining[] = {
  #  define BIG_(a,b) { a, b },
  #  define PAIR(a,b)
  #  define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \
@@ -557,10 +556,9 @@ static int wcwidth(unsigned ucs)
                 BIG_(0xFE20, 0xFE23) \
                 BIG_(0xFEFF, 0xFEFF) \
                 BIG_(0xFFF9, 0xFFFB)
-               ARRAY
+       static const struct interval combining[] = { ARRAY };
  #  undef BIG_
  #  undef PAIR
-       };
  #  define BIG_(a,b)
  #  define PAIR(a,b) (a << 2) | (b-a),
         static const uint16_t combining1[] = { ARRAY };
@@ -668,7 +666,6 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc)
          * http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt
          * Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter
          */
-       static const struct interval rtl_b[] = {
  #  define BIG_(a,b) { a, b },
  #  define PAIR(a,b)
  #  define ARRAY \
@@ -723,10 +720,9 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc)
                 {0x10E7F, 0x10FFF},
                 {0x1E800, 0x1EFFF}
                 */
-               ARRAY
+       static const struct interval rtl_b[] = { ARRAY };
  #  undef BIG_
  #  undef PAIR
-       };
  #  define BIG_(a,b)
  #  define PAIR(a,b) (a << 2) | (b-a),
         static const uint16_t rtl_p[] = { ARRAY };
@@ -755,7 +751,6 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc)
          * White_Space, Other_Neutral, European_Number, European_Separator,
          * European_Terminator, Arabic_Number, Common_Separator
          */
-       static const struct interval neutral_b[] = {
  #  define BIG_(a,b) { a, b },
  #  define PAIR(a,b)
  #  define ARRAY \
@@ -929,10 +924,9 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc)
                 {0x1F030, 0x1F093},
                 {0x1F100, 0x1F10A}
                 */
-               ARRAY
+       static const struct interval neutral_b[] = { ARRAY };
  #  undef BIG_
  #  undef PAIR
-       };
  #  define BIG_(a,b)
  #  define PAIR(a,b) (a << 2) | (b-a),
         static const uint16_t neutral_p[] = { ARRAY };
diff --git a/testsuite/ash.tests b/testsuite/ash.tests

index 6b2caf3..ce585be 100755 (executable)
--- a/testsuite/ash.tests
+++ b/testsuite/ash.tests
@@ -7,8 +7,30 @@
  
  . ./testing.sh
  
+test -f "$bindir/.config" && . "$bindir/.config"
+
  # testing "test name" "options" "expected result" "file input" "stdin"
  
+if test x"$CONFIG_UNICODE_PRESERVE_BROKEN" = x"y"; then
+testing "One byte which is not valid unicode char followed by valid input" \
+       "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
+       "\
+00000000  ff 2d 0a                                          |.-.|
+00000003
+" \
+       "" \
+       "echo \xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
+
+testing "30 bytes which are not valid unicode chars followed by valid input" \
+       "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
+       "\
+00000000  ff ff ff ff ff ff ff ff  ff ff ff ff ff ff ff ff  |................|
+00000010  ff ff ff ff ff ff ff ff  ff ff ff ff ff ff 2d 0a  |..............-.|
+00000020
+" \
+       "" \
+       "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
+else
  testing "One byte which is not valid unicode char followed by valid input" \
         "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
         "\
@@ -27,6 +49,8 @@ testing "30 bytes which are not valid unicode chars followed by valid input" \
  " \
         "" \
         "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
+fi
+
  
  # Not sure this behavior is perfect: we lose all invalid input which precedes
  # arrow keys and such. In this example, \xff\xff are lost
author	Tomas Heinrich <heinrich.tomas@gmail.com>
	Thu, 29 Apr 2010 11:43:39 +0000 (13:43 +0200)
committer	Denys Vlasenko <vda.linux@googlemail.com>
	Thu, 29 Apr 2010 11:43:39 +0000 (13:43 +0200)
Config.in		patch \| blob \| history
libbb/lineedit.c		patch \| blob \| history
libbb/unicode.c		patch \| blob \| history
testsuite/ash.tests		patch \| blob \| history