setlocale(LC_CTYPE, "utf-8");
wbuf = (wchar_t *) safemalloc(wlen);
- /* utf8_to_uvuni(pathname, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
+ /* utf8_to_uvuni_buf(pathname, pathname + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
wlen = mbsrtowcs(wbuf, (const char**)&buf, wlen, &mbs);
if (oldlocale) setlocale(LC_CTYPE, oldlocale);
mbstate_t mbs;
char *oldlocale = setlocale(LC_CTYPE, NULL);
setlocale(LC_CTYPE, "utf-8");
- /* utf8_to_uvuni(src_path, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
+ /* utf8_to_uvuni_buf(src_path, src_path + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
wlen = mbsrtowcs(wpath, (const char**)&src_path, wlen, &mbs);
if (wlen > 0)
err = cygwin_conv_path(what, wpath, wbuf, wlen);
setlocale(LC_CTYPE, "utf-8");
if (!IN_BYTES) {
mbstate_t mbs;
- /* utf8_to_uvuni(src_path, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
+ /* utf8_to_uvuni_buf(src_path, src_path + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
wlen = mbsrtowcs(wpath, (const char**)&src_path, wlen, &mbs);
if (wlen > 0)
err = cygwin_conv_path(what, wpath, wbuf, wlen);
package Data::Dumper;
BEGIN {
- $VERSION = '2.135_05'; # Don't forget to set version and release
+ $VERSION = '2.135_06'; # Don't forget to set version and release
} # date in POD!
#$| = 1;
=head1 VERSION
-Version 2.135_05 (February 18 2012)
+Version 2.135_06 (March 20 2012)
=head1 SEE ALSO
# endif
+/* Compatibility implementation of utf8_to_uvchr_buf(): decodes the UTF-8
+ * character starting at s, handing utf8_to_uv() the number of bytes that
+ * remain before send (one past the end of the buffer) as the length limit.
+ * If retlen is non-NULL it receives the length, in bytes, of the character.
+ * Malformation warnings are suppressed unless the utf8 warning category
+ * is enabled. */
UV
-Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
+Perl_utf8_to_uvchr_buf(pTHX_ U8 *s, U8 *send, STRLEN *retlen)
{
- const UV uv = utf8_to_uv(s, UTF8_MAXLEN, retlen,
+ const UV uv = utf8_to_uv(s, send - s, retlen,
ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
return UNI_TO_NATIVE(uv);
}
# if !defined(PERL_IMPLICIT_CONTEXT)
-# define utf8_to_uvchr Perl_utf8_to_uvchr
+# define utf8_to_uvchr_buf Perl_utf8_to_uvchr_buf
# else
-# define utf8_to_uvchr(a,b) Perl_utf8_to_uvchr(aTHX_ a,b)
+/* Three arguments (start, buffer end, retlen) to match the function above;
+ * aTHX_ supplies the interpreter context. */
+# define utf8_to_uvchr_buf(a,b,c) Perl_utf8_to_uvchr_buf(aTHX_ a,b,c)
# endif
#endif /* PERL_VERSION <= 6 */
/* this will need EBCDICification */
for (s = src; s < send; s += increment) {
- const UV k = utf8_to_uvchr((U8*)s, NULL);
+ const UV k = utf8_to_uvchr_buf((U8*)s, (U8*) send, NULL);
/* check for invalid utf8 */
increment = (k == 0 && *s != '\0') ? 1 : UTF8SKIP(s);
*r++ = '"';
for (s = src; s < send; s += UTF8SKIP(s)) {
- const UV k = utf8_to_uvchr((U8*)s, NULL);
+ const UV k = utf8_to_uvchr_buf((U8*)s, (U8*) send, NULL);
if (k == '"' || k == '\\' || k == '$' || k == '@') {
*r++ = '\\';
isuni = 1;
for ( ; (pv < end && (!max || (wrote < max))) ; pv += readsize ) {
- const UV u= (isuni) ? utf8_to_uvchr((U8*)pv, &readsize) : (U8)*pv;
+ const UV u= (isuni) ? utf8_to_uvchr_buf((U8*)pv, (U8*) end, &readsize) : (U8)*pv;
const U8 c = (U8)u & 0xFF;
if ( ( u > 255 )
retry:
while (pv < e) {
if (utf8) {
- c = utf8_to_uvchr((U8*)pv, &cl);
+ c = utf8_to_uvchr_buf((U8*)pv, (U8*)e, &cl);
if (cl == 0) {
SvCUR(dsv) = dsvcur;
pv = start;
use warnings;
use Carp;
-our $VERSION = '0.36';
+our $VERSION = '0.37';
require XSLoader;
const char *const end = p + len;
while (p < end) {
STRLEN len;
- UV chr = utf8_to_uvuni((U8 *)p, &len);
+ UV chr = utf8_to_uvuni_buf((U8 *)p, (U8 *) end, &len);
new_p = (char *)uvuni_to_utf8((U8 *)new_p, chr ^ 32);
p += len;
}
*((p)+1))) \
: function(p))
-/* Note that all ignore 'use bytes' */
+/* Note that all assume that the utf8 has been validated, and ignore 'use
+ * bytes' */
#define isALNUM_utf8(p) generic_utf8(isWORDCHAR, is_utf8_alnum, p)
/* To prevent S_scan_word in toke.c from hanging, we have to make sure that
: isSPACE_utf8(p)))
#define isBLANK_utf8(c) isBLANK(c) /* could be wrong */
-#define isALNUM_LC_utf8(p) isALNUM_LC_uvchr(utf8_to_uvchr(p, 0))
-#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uvchr(utf8_to_uvchr(p, 0))
-#define isALPHA_LC_utf8(p) isALPHA_LC_uvchr(utf8_to_uvchr(p, 0))
-#define isSPACE_LC_utf8(p) isSPACE_LC_uvchr(utf8_to_uvchr(p, 0))
-#define isDIGIT_LC_utf8(p) isDIGIT_LC_uvchr(utf8_to_uvchr(p, 0))
-#define isUPPER_LC_utf8(p) isUPPER_LC_uvchr(utf8_to_uvchr(p, 0))
-#define isLOWER_LC_utf8(p) isLOWER_LC_uvchr(utf8_to_uvchr(p, 0))
-#define isALNUMC_LC_utf8(p) isALNUMC_LC_uvchr(utf8_to_uvchr(p, 0))
-#define isCNTRL_LC_utf8(p) isCNTRL_LC_uvchr(utf8_to_uvchr(p, 0))
-#define isGRAPH_LC_utf8(p) isGRAPH_LC_uvchr(utf8_to_uvchr(p, 0))
-#define isPRINT_LC_utf8(p) isPRINT_LC_uvchr(utf8_to_uvchr(p, 0))
-#define isPUNCT_LC_utf8(p) isPUNCT_LC_uvchr(utf8_to_uvchr(p, 0))
+#define isALNUM_LC_utf8(p) isALNUM_LC_uvchr(valid_utf8_to_uvchr(p, 0))
+#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uvchr(valid_utf8_to_uvchr(p, 0))
+#define isALPHA_LC_utf8(p) isALPHA_LC_uvchr(valid_utf8_to_uvchr(p, 0))
+#define isSPACE_LC_utf8(p) isSPACE_LC_uvchr(valid_utf8_to_uvchr(p, 0))
+#define isDIGIT_LC_utf8(p) isDIGIT_LC_uvchr(valid_utf8_to_uvchr(p, 0))
+#define isUPPER_LC_utf8(p) isUPPER_LC_uvchr(valid_utf8_to_uvchr(p, 0))
+#define isLOWER_LC_utf8(p) isLOWER_LC_uvchr(valid_utf8_to_uvchr(p, 0))
+#define isALNUMC_LC_utf8(p) isALNUMC_LC_uvchr(valid_utf8_to_uvchr(p, 0))
+#define isCNTRL_LC_utf8(p) isCNTRL_LC_uvchr(valid_utf8_to_uvchr(p, 0))
+#define isGRAPH_LC_utf8(p) isGRAPH_LC_uvchr(valid_utf8_to_uvchr(p, 0))
+#define isPRINT_LC_utf8(p) isPRINT_LC_uvchr(valid_utf8_to_uvchr(p, 0))
+#define isPUNCT_LC_utf8(p) isPUNCT_LC_uvchr(valid_utf8_to_uvchr(p, 0))
#define isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f')
#define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */
whether the byte can be encoded as a single byte even in UTF-8):
U8 *utf;
+ U8 *utf_end; /* 1 beyond buffer pointed to by utf */
UV uv; /* Note: a UV, not a U8, not a char */
STRLEN len; /* length of character in bytes */
if (!UTF8_IS_INVARIANT(*utf))
/* Must treat this as UTF-8 */
- uv = utf8_to_uvchr(utf, &len);
+ uv = utf8_to_uvchr_buf(utf, utf_end, &len);
else
/* OK to treat this character as a byte */
uv = *utf;
-You can also see in that example that we use C<utf8_to_uvchr> to get the
+You can also see in that example that we use C<utf8_to_uvchr_buf> to get the
value of the character; the inverse function C<uvchr_to_utf8> is available
for putting a UV into UTF-8:
=item *
-If a string is UTF-8, B<always> use C<utf8_to_uvchr> to get at the value,
+If a string is UTF-8, B<always> use C<utf8_to_uvchr_buf> to get at the value,
unless C<UTF8_IS_INVARIANT(*s)> in which case you can use C<*s>.
=item *
=item *
-C<utf8_to_uvchr(buf, lenp)> reads UTF-8 encoded bytes from a buffer and
+C<utf8_to_uvchr_buf(buf, bufend, lenp)> reads UTF-8 encoded bytes from a
+buffer (C<bufend> points one byte beyond the end of that buffer) and
returns the Unicode character code point and, optionally, the length of
the UTF-8 byte sequence. It works appropriately on EBCDIC machines.
sv_recode_to_utf8(TARG, PL_encoding);
tmps = SvPVX(TARG);
if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) ||
- UNICODE_IS_REPLACEMENT(utf8_to_uvchr((U8*)tmps, NULL))) {
+ UNICODE_IS_REPLACEMENT(utf8_to_uvchr_buf((U8*)tmps, (U8*) tmps + SvCUR(TARG), NULL))) {
SvGROW(TARG, 2);
tmps = SvPVX(TARG);
SvCUR_set(TARG, 1);
uv = _to_utf8_upper_flags(s, tmpbuf, &ulen,
cBOOL(IN_LOCALE_RUNTIME), &tainted);
if (uv == GREEK_CAPITAL_LETTER_IOTA
- && utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI)
+ && utf8_to_uvchr_buf(s, send, 0) == COMBINING_GREEK_YPOGEGRAMMENI)
{
in_iota_subscript = TRUE;
}
continue;
}
else {
- if (!utf8_to_uvchr(s, 0))
+ if (!utf8_to_uvchr_buf(s, send, 0))
break;
up = (char*)s;
s += UTF8SKIP(s);
UV uc;
if (UTF) {
const U8 * const s = (U8*)STRING(scan);
+ uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
- uc = utf8_to_uvchr(s, NULL);
} else {
uc = *((U8*)STRING(scan));
}
}
if (UTF) {
const U8 * const s = (U8 *)STRING(scan);
+ uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
- uc = utf8_to_uvchr(s, NULL);
}
else if (has_exactf_sharp_s) {
RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
for (foldbuf = tmpbuf;
foldlen;
foldlen -= numlen) {
- ender = utf8_to_uvchr(foldbuf, &numlen);
+
+ /* tmpbuf has been constructed by us, so we
+ * know it is valid utf8 */
+ ender = valid_utf8_to_uvchr(foldbuf, &numlen);
if (numlen > 0) {
const STRLEN unilen = reguni(pRExC_state, ender, s);
s += unilen;
for (foldbuf = tmpbuf;
foldlen;
foldlen -= numlen) {
- ender = utf8_to_uvchr(foldbuf, &numlen);
+ ender = valid_utf8_to_uvchr(foldbuf, &numlen);
if (numlen > 0) {
const STRLEN unilen = reguni(pRExC_state, ender, s);
len += unilen;
#else
dTHX;
for (i = 0; i < nUtf8; i+= UTF8SKIP(pUtf8 + i)) {
- unsigned long u = utf8_to_uvchr((U8*)(pUtf8 + i), 0);
+ unsigned long u = utf8_to_uvchr_buf((U8*)(pUtf8 + i),
+ (U8*)(pUtf8 + nUtf8),
+ 0);
if (u > 0xFF) {
iConsole->Printf(_L("(keycode > 0xFF)\n"));
buf[i] = 0;
dTHX;
if (is_utf8_string((U8*)buf, n)) {
for (int i = 0; i < n; i += UTF8SKIP(buf + i)) {
- TChar u = utf8_to_uvchr((U8*)(buf + i), 0);
+ TChar u = valid_utf8_to_uvchr((U8*)(buf + i), 0);
iConsole->Printf(_L("%c"), u);
wrote++;
}
utf8.c AOK
- [utf8_to_uvchr]
+ [utf8_to_uvchr_buf]
Malformed UTF-8 character
my $a = ord "\x80" ;
<<<<<< Add a test when something actually calls utf16_to_utf8
__END__
-# utf8.c [utf8_to_uvchr] -W
+# utf8.c [utf8_to_uvchr_buf] -W
BEGIN {
if (ord('A') == 193) {
print "SKIPPED\n# ebcdic platforms do not generate Malformed UTF-8 warnings.";
termlen = 1;
}
else {
- termcode = utf8_to_uvchr((U8*)s, &termlen);
+ termcode = utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend, &termlen);
Copy(s, termstr, termlen, U8);
if (!UTF8_IS_INVARIANT(term))
has_utf8 = TRUE;
which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
-If C<s> does not point to a well-formed UTF-8 character, zero is
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer.
+Use L</utf8_to_uvchr_buf> instead.
+
+If C<s> points to one of the detected malformations, zero is
returned and C<retlen> is set, if possible, to -1.
=cut
{
PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
- return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
- ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ return valid_utf8_to_uvchr(s, retlen);
}
/*
which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
-This function should only be used when the returned UV is considered
-an index into the Unicode semantic tables (e.g. swashes).
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer.
+Use L</utf8_to_uvuni_buf> instead.
-If C<s> does not point to a well-formed UTF-8 character, zero is
+If C<s> points to one of the detected malformations, zero is
returned and C<retlen> is set, if possible, to -1.
=cut
{
PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
- /* Call the low level routine asking for checks */
- return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
- ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ return valid_utf8_to_uvuni(s, retlen);
}
/*
d = s = save;
while (s < send) {
STRLEN ulen;
- *d++ = (U8)utf8_to_uvchr(s, &ulen);
+ *d++ = (U8)utf8_to_uvchr_buf(s, send, &ulen);
s += ulen;
}
*d = '\0';
dVAR;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
STRLEN len = 0;
- const UV uv0 = utf8_to_uvchr(p, NULL);
+ const UV uv0 = valid_utf8_to_uvchr(p, NULL);
/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
* are necessary in EBCDIC, they are redundant no-ops
* in ASCII-ish platforms, and hopefully optimized away. */
bad_crossing:
/* Failed, have to return the original */
- original = utf8_to_uvchr(p, lenp);
+ original = valid_utf8_to_uvchr(p, lenp);
Copy(p, ustrp, *lenp, char);
return original;
}
"unexpectedly is not a string, flags=%lu",
(unsigned long)SvFLAGS(sv_to));
}
- /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", utf8_to_uvchr((U8*) char_from, 0), utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
+ /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
/* Each key in the inverse list is a mapped-to value, and the key's
* hash value is a list of the strings (each in utf8) that map to
Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
}
- /* For debugging: UV u = utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
+ /* For debugging: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
for (j = 0; j <= av_len(from_list); j++) {
entryp = av_fetch(from_list, j, FALSE);
if (entryp == NULL) {
}
/* When i==j this adds itself to the list */
- av_push(i_list, newSVuv(utf8_to_uvchr(
- (U8*) SvPVX(*entryp), 0)));
- /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
+ av_push(i_list, newSVuv(utf8_to_uvchr_buf(
+ (U8*) SvPVX(*entryp),
+ (U8*) SvPVX(*entryp) + SvCUR(*entryp),
+ 0)));
+ /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
}
}
}
STRLEN char_len;
if (UTF8_IS_SUPER(s)) {
if (ckWARN_d(WARN_NON_UNICODE)) {
- UV uv = utf8_to_uvchr(s, &char_len);
+ UV uv = utf8_to_uvchr_buf(s, e, &char_len);
Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
ok = FALSE;
}
else if (UTF8_IS_SURROGATE(s)) {
if (ckWARN_d(WARN_SURROGATE)) {
- UV uv = utf8_to_uvchr(s, &char_len);
+ UV uv = utf8_to_uvchr_buf(s, e, &char_len);
Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
"Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
ok = FALSE;
((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
&& (ckWARN_d(WARN_NONCHAR)))
{
- UV uv = utf8_to_uvchr(s, &char_len);
+ UV uv = utf8_to_uvchr_buf(s, e, &char_len);
Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
"Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
ok = FALSE;
truncated++;
break;
}
- u = utf8_to_uvchr((U8*)s, 0);
+ u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
if (u < 256) {
const unsigned char c = (unsigned char)u & 0xFF;
if (flags & UNI_DISPLAY_BACKSLASH) {