Incomplete implementation of $/ = \number acting like read()

author Tony Cook <tony@develop-help.com>

Fri, 7 Dec 2012 23:23:20 +0000 (10:23 +1100)

committer Tony Cook <tony@develop-help.com>

Sat, 8 Dec 2012 22:32:43 +0000 (09:32 +1100)
author Tony Cook <tony@develop-help.com>
Fri, 7 Dec 2012 23:23:20 +0000 (10:23 +1100)
committer Tony Cook <tony@develop-help.com>
Sat, 8 Dec 2012 22:32:43 +0000 (09:32 +1100)
diff --git a/sv.c b/sv.c

index 284c16a..78eeddd 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -7670,7 +7670,7 @@ S_sv_gets_read_record(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
      SSize_t bytesread;
      const STRLEN recsize = SvUV(SvRV(PL_rs)); /* RsRECORD() guarantees > 0. */
        /* Grab the size of the record we're getting */
-    char *const buffer = SvGROW(sv, (STRLEN)(recsize + append + 1)) + append;
+    char *buffer = SvGROW(sv, (STRLEN)(recsize + append + 1)) + append;
  #ifdef VMS
      int fd;
  #endif
@@ -7690,6 +7690,64 @@ S_sv_gets_read_record(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
  #endif
      {
         bytesread = PerlIO_read(fp, buffer, recsize);
+
+       /* At this point, the logic in sv_get() means that sv will
+          be treated as utf-8 if the handle is utf8.
+       */
+       if (PerlIO_isutf8(fp) && bytesread > 0) {
+           char *bend = buffer + bytesread;
+           char *bufp = buffer;
+           size_t charcount = 0;
+           bool charstart = TRUE;
+           STRLEN skip = 0;
+
+           while (charcount < recsize) {
+               /* count accumulated characters */
+               while (bufp < bend) {
+                   if (charstart) {
+                       skip = UTF8SKIP(bufp);
+                   }
+                   if (bufp + skip > bend) {
+                       /* partial at the end */
+                       charstart = FALSE;
+                       break;
+                   }
+                   else {
+                       ++charcount;
+                       bufp += skip;
+                       charstart = TRUE;
+                   }
+               }
+
+               if (!charstart) {
+                   /* read the rest of the current character, and maybe the
+                      beginning of the next, if we need it */
+                   STRLEN readsize = skip - (bend - bufp) + (charcount + 1 < recsize);
+                   STRLEN bufp_offset = bufp - buffer;
+                   SSize_t morebytesread;
+
+                   buffer = SvGROW(sv, append + bytesread + readsize + 1) + append;
+                   bend = buffer + bytesread;
+                   morebytesread = PerlIO_read(fp, bend, readsize);
+                   if (morebytesread <= 0) {
+                       /* we're done, if we still have incomplete
+                          characters the check code in sv_gets() will
+                          warn and zero them.
+
+                          FIXME: If we've read more than one lead
+                          character for an incomplete character, push
+                          it back.
+                       */
+                       break;
+                   }
+
+                   /* prepare to scan some more */
+                   bytesread += morebytesread;
+                   bend = buffer + bytesread;
+                   bufp = buffer + bufp_offset;
+               }
+           }
+       }
      }
  
      if (bytesread < 0)
diff --git a/t/io/utf8.t b/t/io/utf8.t

index 4b01747..ed535a3 100644 (file)
--- a/t/io/utf8.t
+++ b/t/io/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
  no utf8; # needed for use utf8 not griping about the raw octets
  
  
-plan(tests => 55);
+plan(tests => 58);
  
  $| = 1;
  
@@ -348,3 +348,32 @@ is($failed, undef);
           "<:utf8 rcatline must warn about bad utf8");
      close F;
  }
+
+{
+    # fixed record reads
+    open F, ">:utf8", $a_file;
+    print F "foo\xE4";
+    print F "bar\xFE";
+    close F;
+    open F, "<:utf8", $a_file;
+    local $/ = \4;
+    my $line = <F>;
+    is($line, "foo\xE4", "readline with \$/ = \\4");
+    $line .= <F>;
+    is($line, "foo\xE4bar\xFE", "rcatline with \$/ = \\4");
+    close F;
+
+    # badly encoded at EOF
+    open F, ">:raw", $a_file;
+    print F "foo\xEF\xAC"; # truncated \x{FB04} small ligature ffl
+    close F;
+
+    use warnings 'utf8';
+    open F, "<:utf8", $a_file;
+    undef $@;
+    local $SIG{__WARN__} = sub { $@ = shift };
+    $line = <F>;
+
+    like( $@, qr/utf8 "\\xEF" does not map to Unicode .+ <F> chunk 1/,
+         "<:utf8 readline (fixed) must warn about bad utf8");
+}
author	Tony Cook <tony@develop-help.com>
	Fri, 7 Dec 2012 23:23:20 +0000 (10:23 +1100)
committer	Tony Cook <tony@develop-help.com>
	Sat, 8 Dec 2012 22:32:43 +0000 (09:32 +1100)