Re: [PATCH] fix unicode split /\s+/

author SADAHIRO Tomoyuki <BQW10602@nifty.com>

Sat, 20 Jan 2007 00:52:42 +0000 (09:52 +0900)

committer Nicholas Clark <nick@ccl4.org>

Fri, 19 Jan 2007 21:11:40 +0000 (21:11 +0000)
author SADAHIRO Tomoyuki <BQW10602@nifty.com>
Sat, 20 Jan 2007 00:52:42 +0000 (09:52 +0900)
committer Nicholas Clark <nick@ccl4.org>
Fri, 19 Jan 2007 21:11:40 +0000 (21:11 +0000)
diff --git a/pp.c b/pp.c

index 4b021c0..977c2b7 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -4590,7 +4590,11 @@ PP(pp_split)
      base = SP - PL_stack_base;
      orig = s;
      if (pm->op_pmflags & PMf_SKIPWHITE) {
-       if (pm->op_pmflags & PMf_LOCALE) {
+       if (do_utf8) {
+           while (*s == ' ' || is_utf8_space((U8*)s))
+               s += UTF8SKIP(s);
+       }
+       else if (pm->op_pmflags & PMf_LOCALE) {
             while (isSPACE_LC(*s))
                 s++;
         }
@@ -4606,22 +4610,18 @@ PP(pp_split)
      if (!limit)
         limit = maxiters + 2;
      if (pm->op_pmflags & PMf_WHITE) {
-        if (do_utf8 && !PL_utf8_space) { 
-            /* force PL_utf8_space to be loaded */
-            bool ok; 
-            ENTER; 
-            ok = is_utf8_space((const U8*)" "); 
-            assert(ok); 
-            LEAVE; 
-        } 
         while (--limit) {
             m = s;
             /* this one uses 'm' and is a negative test */
             if (do_utf8) {
-               STRLEN uskip;
-                while (m < strend &&
-                       !( *m == ' ' || swash_fetch(PL_utf8_space,(U8*)m, do_utf8) ))
-                   m +=  UTF8SKIP(m);
+               while (m < strend && !( *m == ' ' || is_utf8_space((U8*)m) )) {
+                   const int t = UTF8SKIP(m);
+                   /* is_utf8_space returns FALSE for malform utf8 */
+                   if (strend - m < t)
+                       m = strend;
+                   else
+                       m += t;
+               }
              } else if (pm->op_pmflags & PMf_LOCALE) {
                 while (m < strend && !isSPACE_LC(*m))
                     ++m;
@@ -4639,11 +4639,15 @@ PP(pp_split)
                 (void)SvUTF8_on(dstr);
             XPUSHs(dstr);
  
-           s = m + 1;
+           /* skip the whitespace found last */
+           if (do_utf8)
+               s = m + UTF8SKIP(m);
+           else
+               s = m + 1;
+
             /* this one uses 's' and is a positive test */
             if (do_utf8) {
-                while (s < strend &&
-                       ( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8) ))
+               while (s < strend && ( *s == ' ' || is_utf8_space((U8*)s) ))
                     s +=  UTF8SKIP(s);
              } else if (pm->op_pmflags & PMf_LOCALE) {
                 while (s < strend && isSPACE_LC(*s))
diff --git a/t/op/split.t b/t/op/split.t

index f5d0c41..b6d7570 100755 (executable)
--- a/t/op/split.t
+++ b/t/op/split.t
@@ -6,7 +6,7 @@ BEGIN {
      require './test.pl';
  }
  
-plan tests => 80;
+plan tests => 130;
  
  $FS = ':';
  
@@ -309,11 +309,15 @@ ok(@ary == 3 &&
      # For terms of use, see http://www.unicode.org/terms_of_use.html
      # For documentation, see UCD.html
      my @spaces=(
-        0x0009..0x000A, # Cc   [5] <control-0009>..<control-000D>
-        0x000C..0x000D, # EXCLUDING \v aka ctl-000B aka vert-tab
-        0x0020,         # Zs       SPACE
-        0x0085,         # Cc       <control-0085>
-        0x00A0,         # Zs       NO-BREAK SPACE
+       ord("\t"),      # Cc       <control-0009>
+       ord("\n"),      # Cc       <control-000A>
+       # not PerlSpace # Cc       <control-000B>
+       ord("\f"),      # Cc       <control-000C>
+       ord("\r"),      # Cc       <control-000D>
+       ord(" "),       # Zs       SPACE
+       ord("\N{NEL}"), # Cc       <control-0085>
+       ord("\N{NO-BREAK SPACE}"),
+                       # Zs       NO-BREAK SPACE
          0x1680,         # Zs       OGHAM SPACE MARK
          0x180E,         # Zs       MONGOLIAN VOWEL SEPARATOR
          0x2000..0x200A, # Zs  [11] EN QUAD..HAIR SPACE
@@ -325,10 +329,21 @@ ok(@ary == 3 &&
      );
      #diag "Have @{[0+@spaces]} to test\n";
      foreach my $cp (@spaces) {
+       my $msg = sprintf "Space: U+%04x", $cp;
          my $space = chr($cp);
-        my $str="A:$space:B\x{FFFF}";
+        my $str="A:$space:B\x{FFFD}";
          chop $str;
+
          my @res=split(/\s+/,$str);
-        is(0+@res,2) or do { diag sprintf "Char failed: 0x%x",$cp }
+        ok(@res == 2 && join('-',@res) eq "A:-:B", "$msg - /\\s+/");
+
+        my $s2 = "$space$space:A:$space$space:B\x{FFFD}";
+        chop $s2;
+
+        my @r2 = split(' ',$s2);
+        ok(@r2 == 2 && join('-', @r2) eq ":A:-:B",  "$msg - ' '");
+
+        my @r3 = split(/\s+/, $s2);
+        ok(@r3 == 3 && join('-', @r3) eq "-:A:-:B", "$msg - /\\s+/ No.2");
      }
  }
author	SADAHIRO Tomoyuki <BQW10602@nifty.com>
	Sat, 20 Jan 2007 00:52:42 +0000 (09:52 +0900)
committer	Nicholas Clark <nick@ccl4.org>
	Fri, 19 Jan 2007 21:11:40 +0000 (21:11 +0000)
pp.c		patch \| blob \| history
t/op/split.t		patch \| blob \| history