From ca93b89efc0dce911b48015df02eaeaeb3d52572 Mon Sep 17 00:00:00 2001 From: Tony Cook Date: Sat, 8 Dec 2012 10:23:20 +1100 Subject: [PATCH] Incomplete implementation of $/ = \number acting like read() It's under tested, and incomplete - readline appears to ignore IN_BYTES, so this code continues to do so. - currently :utf8 will return invalid utf8, which means this can too, if we can be sure of :utf8 returning only valud utf-8 the FIXME can be ignored - VMS is the elephant in the room - the conditional means that the new code is completely ignored for reading from files. If we can detect record-oriented files in some way this could change. --- sv.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- t/io/utf8.t | 31 ++++++++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/sv.c b/sv.c index 284c16a..78eeddd 100644 --- a/sv.c +++ b/sv.c @@ -7670,7 +7670,7 @@ S_sv_gets_read_record(pTHX_ SV *const sv, PerlIO *const fp, I32 append) SSize_t bytesread; const STRLEN recsize = SvUV(SvRV(PL_rs)); /* RsRECORD() guarantees > 0. */ /* Grab the size of the record we're getting */ - char *const buffer = SvGROW(sv, (STRLEN)(recsize + append + 1)) + append; + char *buffer = SvGROW(sv, (STRLEN)(recsize + append + 1)) + append; #ifdef VMS int fd; #endif @@ -7690,6 +7690,64 @@ S_sv_gets_read_record(pTHX_ SV *const sv, PerlIO *const fp, I32 append) #endif { bytesread = PerlIO_read(fp, buffer, recsize); + + /* At this point, the logic in sv_get() means that sv will + be treated as utf-8 if the handle is utf8. + */ + if (PerlIO_isutf8(fp) && bytesread > 0) { + char *bend = buffer + bytesread; + char *bufp = buffer; + size_t charcount = 0; + bool charstart = TRUE; + STRLEN skip = 0; + + while (charcount < recsize) { + /* count accumulated characters */ + while (bufp < bend) { + if (charstart) { + skip = UTF8SKIP(bufp); + } + if (bufp + skip > bend) { + /* partial at the end */ + charstart = FALSE; + break; + } + else { + ++charcount; + bufp += skip; + charstart = TRUE; + } + } + + if (!charstart) { + /* read the rest of the current character, and maybe the + beginning of the next, if we need it */ + STRLEN readsize = skip - (bend - bufp) + (charcount + 1 < recsize); + STRLEN bufp_offset = bufp - buffer; + SSize_t morebytesread; + + buffer = SvGROW(sv, append + bytesread + readsize + 1) + append; + bend = buffer + bytesread; + morebytesread = PerlIO_read(fp, bend, readsize); + if (morebytesread <= 0) { + /* we're done, if we still have incomplete + characters the check code in sv_gets() will + warn and zero them. + + FIXME: If we've read more than one lead + character for an incomplete character, push + it back. + */ + break; + } + + /* prepare to scan some more */ + bytesread += morebytesread; + bend = buffer + bytesread; + bufp = buffer + bufp_offset; + } + } + } } if (bytesread < 0) diff --git a/t/io/utf8.t b/t/io/utf8.t index 4b01747..ed535a3 100644 --- a/t/io/utf8.t +++ b/t/io/utf8.t @@ -10,7 +10,7 @@ BEGIN { no utf8; # needed for use utf8 not griping about the raw octets -plan(tests => 55); +plan(tests => 58); $| = 1; @@ -348,3 +348,32 @@ is($failed, undef); "<:utf8 rcatline must warn about bad utf8"); close F; } + +{ + # fixed record reads + open F, ">:utf8", $a_file; + print F "foo\xE4"; + print F "bar\xFE"; + close F; + open F, "<:utf8", $a_file; + local $/ = \4; + my $line = ; + is($line, "foo\xE4", "readline with \$/ = \\4"); + $line .= ; + is($line, "foo\xE4bar\xFE", "rcatline with \$/ = \\4"); + close F; + + # badly encoded at EOF + open F, ">:raw", $a_file; + print F "foo\xEF\xAC"; # truncated \x{FB04} small ligature ffl + close F; + + use warnings 'utf8'; + open F, "<:utf8", $a_file; + undef $@; + local $SIG{__WARN__} = sub { $@ = shift }; + $line = ; + + like( $@, qr/utf8 "\\xEF" does not map to Unicode .+ chunk 1/, + "<:utf8 readline (fixed) must warn about bad utf8"); +} -- 2.7.4