From ca93b89efc0dce911b48015df02eaeaeb3d52572 Mon Sep 17 00:00:00 2001
From: Tony Cook <tony@develop-help.com>
Date: Sat, 8 Dec 2012 10:23:20 +1100
Subject: [PATCH] Incomplete implementation of $/ = \number acting like read()

It's under tested, and incomplete

- readline appears to ignore IN_BYTES, so this code continues to do so.

- currently :utf8 will return invalid utf8, which means this can too,
  if we can be sure of :utf8 returning only valud utf-8 the FIXME can
  be ignored

- VMS is the elephant in the room - the conditional means that the new
  code is completely ignored for reading from files.  If we can detect
  record-oriented files in some way this could change.
---
 sv.c        | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 t/io/utf8.t | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 89 insertions(+), 2 deletions(-)
diff --git a/sv.c b/sv.c
index 284c16a..78eeddd 100644
--- a/sv.c
+++ b/sv.c
@@ -7670,7 +7670,7 @@ S_sv_gets_read_record(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
     SSize_t bytesread;
     const STRLEN recsize = SvUV(SvRV(PL_rs)); /* RsRECORD() guarantees > 0. */
       /* Grab the size of the record we're getting */
-    char *const buffer = SvGROW(sv, (STRLEN)(recsize + append + 1)) + append;
+    char *buffer = SvGROW(sv, (STRLEN)(recsize + append + 1)) + append;
 #ifdef VMS
     int fd;
 #endif
@@ -7690,6 +7690,64 @@ S_sv_gets_read_record(pTHX_ SV *const sv, PerlIO *const fp, I32 append)
 #endif
     {
 	bytesread = PerlIO_read(fp, buffer, recsize);
+
+	/* At this point, the logic in sv_get() means that sv will
+	   be treated as utf-8 if the handle is utf8.
+	*/
+	if (PerlIO_isutf8(fp) && bytesread > 0) {
+	    char *bend = buffer + bytesread;
+	    char *bufp = buffer;
+	    size_t charcount = 0;
+	    bool charstart = TRUE;
+	    STRLEN skip = 0;
+
+	    while (charcount < recsize) {
+		/* count accumulated characters */
+		while (bufp < bend) {
+		    if (charstart) {
+			skip = UTF8SKIP(bufp);
+		    }
+		    if (bufp + skip > bend) {
+			/* partial at the end */
+			charstart = FALSE;
+			break;
+		    }
+		    else {
+			++charcount;
+			bufp += skip;
+			charstart = TRUE;
+		    }
+		}
+
+		if (!charstart) {
+		    /* read the rest of the current character, and maybe the
+		       beginning of the next, if we need it */
+		    STRLEN readsize = skip - (bend - bufp) + (charcount + 1 < recsize);
+		    STRLEN bufp_offset = bufp - buffer;
+		    SSize_t morebytesread;
+
+		    buffer = SvGROW(sv, append + bytesread + readsize + 1) + append;
+		    bend = buffer + bytesread;
+		    morebytesread = PerlIO_read(fp, bend, readsize);
+		    if (morebytesread <= 0) {
+			/* we're done, if we still have incomplete
+			   characters the check code in sv_gets() will
+			   warn and zero them.
+
+			   FIXME: If we've read more than one lead
+			   character for an incomplete character, push
+			   it back.
+			*/
+			break;
+		    }
+
+		    /* prepare to scan some more */
+		    bytesread += morebytesread;
+		    bend = buffer + bytesread;
+		    bufp = buffer + bufp_offset;
+		}
+	    }
+	}
     }
 
     if (bytesread < 0)
diff --git a/t/io/utf8.t b/t/io/utf8.t
index 4b01747..ed535a3 100644
--- a/t/io/utf8.t
+++ b/t/io/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
 no utf8; # needed for use utf8 not griping about the raw octets
 
 
-plan(tests => 55);
+plan(tests => 58);
 
 $| = 1;
 
@@ -348,3 +348,32 @@ is($failed, undef);
 	  "<:utf8 rcatline must warn about bad utf8");
     close F;
 }
+
+{
+    # fixed record reads
+    open F, ">:utf8", $a_file;
+    print F "foo\xE4";
+    print F "bar\xFE";
+    close F;
+    open F, "<:utf8", $a_file;
+    local $/ = \4;
+    my $line = <F>;
+    is($line, "foo\xE4", "readline with \$/ = \\4");
+    $line .= <F>;
+    is($line, "foo\xE4bar\xFE", "rcatline with \$/ = \\4");
+    close F;
+
+    # badly encoded at EOF
+    open F, ">:raw", $a_file;
+    print F "foo\xEF\xAC"; # truncated \x{FB04} small ligature ffl
+    close F;
+
+    use warnings 'utf8';
+    open F, "<:utf8", $a_file;
+    undef $@;
+    local $SIG{__WARN__} = sub { $@ = shift };
+    $line = <F>;
+
+    like( $@, qr/utf8 "\\xEF" does not map to Unicode .+ <F> chunk 1/,
+	  "<:utf8 readline (fixed) must warn about bad utf8");
+}
-- 
2.7.4