Incomplete implementation of $/ = \number acting like read()
authorTony Cook <tony@develop-help.com>
Fri, 7 Dec 2012 23:23:20 +0000 (10:23 +1100)
committerTony Cook <tony@develop-help.com>
Sat, 8 Dec 2012 22:32:43 +0000 (09:32 +1100)
It's under tested, and incomplete

- readline appears to ignore IN_BYTES, so this code continues to do so.

- currently :utf8 will return invalid utf8, which means this can too,
  if we can be sure of :utf8 returning only valud utf-8 the FIXME can
  be ignored

- VMS is the elephant in the room - the conditional means that the new
  code is completely ignored for reading from files.  If we can detect
  record-oriented files in some way this could change.

sv.c
t/io/utf8.t

diff --git a/sv.c b/sv.c
index 284c16a..78eeddd 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -7670,7 +7670,7 @@ S_sv_gets_read_record(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
     SSize_t bytesread;
     const STRLEN recsize = SvUV(SvRV(PL_rs)); /* RsRECORD() guarantees > 0. */
       /* Grab the size of the record we're getting */
-    char *const buffer = SvGROW(sv, (STRLEN)(recsize + append + 1)) + append;
+    char *buffer = SvGROW(sv, (STRLEN)(recsize + append + 1)) + append;
 #ifdef VMS
     int fd;
 #endif
@@ -7690,6 +7690,64 @@ S_sv_gets_read_record(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
 #endif
     {
        bytesread = PerlIO_read(fp, buffer, recsize);
+
+       /* At this point, the logic in sv_get() means that sv will
+          be treated as utf-8 if the handle is utf8.
+       */
+       if (PerlIO_isutf8(fp) && bytesread > 0) {
+           char *bend = buffer + bytesread;
+           char *bufp = buffer;
+           size_t charcount = 0;
+           bool charstart = TRUE;
+           STRLEN skip = 0;
+
+           while (charcount < recsize) {
+               /* count accumulated characters */
+               while (bufp < bend) {
+                   if (charstart) {
+                       skip = UTF8SKIP(bufp);
+                   }
+                   if (bufp + skip > bend) {
+                       /* partial at the end */
+                       charstart = FALSE;
+                       break;
+                   }
+                   else {
+                       ++charcount;
+                       bufp += skip;
+                       charstart = TRUE;
+                   }
+               }
+
+               if (!charstart) {
+                   /* read the rest of the current character, and maybe the
+                      beginning of the next, if we need it */
+                   STRLEN readsize = skip - (bend - bufp) + (charcount + 1 < recsize);
+                   STRLEN bufp_offset = bufp - buffer;
+                   SSize_t morebytesread;
+
+                   buffer = SvGROW(sv, append + bytesread + readsize + 1) + append;
+                   bend = buffer + bytesread;
+                   morebytesread = PerlIO_read(fp, bend, readsize);
+                   if (morebytesread <= 0) {
+                       /* we're done, if we still have incomplete
+                          characters the check code in sv_gets() will
+                          warn and zero them.
+
+                          FIXME: If we've read more than one lead
+                          character for an incomplete character, push
+                          it back.
+                       */
+                       break;
+                   }
+
+                   /* prepare to scan some more */
+                   bytesread += morebytesread;
+                   bend = buffer + bytesread;
+                   bufp = buffer + bufp_offset;
+               }
+           }
+       }
     }
 
     if (bytesread < 0)
index 4b01747..ed535a3 100644 (file)
@@ -10,7 +10,7 @@ BEGIN {
 no utf8; # needed for use utf8 not griping about the raw octets
 
 
-plan(tests => 55);
+plan(tests => 58);
 
 $| = 1;
 
@@ -348,3 +348,32 @@ is($failed, undef);
          "<:utf8 rcatline must warn about bad utf8");
     close F;
 }
+
+{
+    # fixed record reads
+    open F, ">:utf8", $a_file;
+    print F "foo\xE4";
+    print F "bar\xFE";
+    close F;
+    open F, "<:utf8", $a_file;
+    local $/ = \4;
+    my $line = <F>;
+    is($line, "foo\xE4", "readline with \$/ = \\4");
+    $line .= <F>;
+    is($line, "foo\xE4bar\xFE", "rcatline with \$/ = \\4");
+    close F;
+
+    # badly encoded at EOF
+    open F, ">:raw", $a_file;
+    print F "foo\xEF\xAC"; # truncated \x{FB04} small ligature ffl
+    close F;
+
+    use warnings 'utf8';
+    open F, "<:utf8", $a_file;
+    undef $@;
+    local $SIG{__WARN__} = sub { $@ = shift };
+    $line = <F>;
+
+    like( $@, qr/utf8 "\\xEF" does not map to Unicode .+ <F> chunk 1/,
+         "<:utf8 readline (fixed) must warn about bad utf8");
+}