fix another boundary case and hopefully improve performance

author Tony Cook <tony@develop-help.com>

Sat, 17 Mar 2012 01:54:17 +0000 (12:54 +1100)

committer Tony Cook <tony@develop-help.com>

Sat, 8 Dec 2012 22:32:45 +0000 (09:32 +1100)
author Tony Cook <tony@develop-help.com>
Sat, 17 Mar 2012 01:54:17 +0000 (12:54 +1100)
committer Tony Cook <tony@develop-help.com>
Sat, 8 Dec 2012 22:32:45 +0000 (09:32 +1100)
diff --git a/sv.c b/sv.c

index e2bed05..34dc534 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -7720,13 +7720,26 @@ S_sv_gets_read_record(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
                 }
  
                 if (charcount < recsize) {
-                   /* read the rest of the current character, and maybe the
-                      beginning of the next, if we need it */
-                   STRLEN readsize = (charstart ? 0 : skip - (bend - bufp))
-                       + (charcount + 1 < recsize);
+                   STRLEN readsize;
                     STRLEN bufp_offset = bufp - buffer;
                     SSize_t morebytesread;
  
+                   /* originally I read enough to fill any incomplete
+                      character and the first byte of the next
+                      character if needed, but if there's many
+                      multi-byte encoded characters we're going to be
+                      making a read call for every character beyond
+                      the original read size.
+
+                      So instead, read the rest of the character if
+                      any, and enough bytes to match at least the
+                      start bytes for each character we're going to
+                      read.
+                   */
+                   if (charstart)
+                       readsize = recsize - charcount;
+                   else 
+                       readsize = skip - (bend - bufp) + recsize - charcount - 1;
                     buffer = SvGROW(sv, append + bytesread + readsize + 1) + append;
                     bend = buffer + bytesread;
                     morebytesread = PerlIO_read(fp, bend, readsize);
diff --git a/t/io/utf8.t b/t/io/utf8.t

index 919b734..29beba2 100644 (file)
--- a/t/io/utf8.t
+++ b/t/io/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
  no utf8; # needed for use utf8 not griping about the raw octets
  
  
-plan(tests => 59);
+plan(tests => 61);
  
  $| = 1;
  
@@ -354,6 +354,8 @@ is($failed, undef);
      open F, ">:utf8", $a_file;
      print F "foo\xE4";
      print F "bar\xFE";
+    print F "\xC0\xC8\xCC\xD2";
+    print F "a\xE4ab";
      print F "a\xE4a";
      close F;
      open F, "<:utf8", $a_file;
@@ -363,6 +365,10 @@ is($failed, undef);
      $line .= <F>;
      is($line, "foo\xE4bar\xFE", "rcatline with \$/ = \\4");
      $line = <F>;
+    is($line, "\xC0\xC8\xCC\xD2", "readline with several encoded characters");
+    $line = <F>;
+    is($line, "a\xE4ab", "readline with another boundary condition");
+    $line = <F>;
      is($line, "a\xE4a", "readline with boundary condition");
      close F;
author	Tony Cook <tony@develop-help.com>
	Sat, 17 Mar 2012 01:54:17 +0000 (12:54 +1100)
committer	Tony Cook <tony@develop-help.com>
	Sat, 8 Dec 2012 22:32:45 +0000 (09:32 +1100)