Use the new utf8 to code point functions

author Karl Williamson <public@khwilliamson.com>

Mon, 19 Mar 2012 21:38:06 +0000 (15:38 -0600)

committer Karl Williamson <public@khwilliamson.com>

Tue, 20 Mar 2012 00:23:44 +0000 (18:23 -0600)
author Karl Williamson <public@khwilliamson.com>
Mon, 19 Mar 2012 21:38:06 +0000 (15:38 -0600)
committer Karl Williamson <public@khwilliamson.com>
Tue, 20 Mar 2012 00:23:44 +0000 (18:23 -0600)
diff --git a/cygwin/cygwin.c b/cygwin/cygwin.c

index 9419e83..29ee22e 100644 (file)
--- a/cygwin/cygwin.c
+++ b/cygwin/cygwin.c
@@ -176,7 +176,7 @@ utf8_to_wide(const char *buf)
  
      setlocale(LC_CTYPE, "utf-8");
      wbuf = (wchar_t *) safemalloc(wlen);
-    /* utf8_to_uvuni(pathname, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
+    /* utf8_to_uvuni_buf(pathname, pathname + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
      wlen = mbsrtowcs(wbuf, (const char**)&buf, wlen, &mbs);
  
      if (oldlocale) setlocale(LC_CTYPE, oldlocale);
@@ -283,7 +283,7 @@ XS(XS_Cygwin_win_to_posix_path)
             mbstate_t mbs;
              char *oldlocale = setlocale(LC_CTYPE, NULL);
              setlocale(LC_CTYPE, "utf-8");
-           /* utf8_to_uvuni(src_path, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
+           /* utf8_to_uvuni_buf(src_path, src_path + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
             wlen = mbsrtowcs(wpath, (const char**)&src_path, wlen, &mbs);
             if (wlen > 0)
                 err = cygwin_conv_path(what, wpath, wbuf, wlen);
@@ -370,7 +370,7 @@ XS(XS_Cygwin_posix_to_win_path)
         setlocale(LC_CTYPE, "utf-8");
         if (!IN_BYTES) {
             mbstate_t mbs;
-           /* utf8_to_uvuni(src_path, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
+           /* utf8_to_uvuni_buf(src_path, src_path + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */
             wlen = mbsrtowcs(wpath, (const char**)&src_path, wlen, &mbs);
             if (wlen > 0)
                 err = cygwin_conv_path(what, wpath, wbuf, wlen);
diff --git a/dist/Data-Dumper/Dumper.pm b/dist/Data-Dumper/Dumper.pm

index 5cff100..a099277 100644 (file)
--- a/dist/Data-Dumper/Dumper.pm
+++ b/dist/Data-Dumper/Dumper.pm
@@ -10,7 +10,7 @@
  package Data::Dumper;
  
  BEGIN {
-    $VERSION = '2.135_05'; # Don't forget to set version and release
+    $VERSION = '2.135_06'; # Don't forget to set version and release
  }                         # date in POD!
  
  #$| = 1;
@@ -1332,7 +1332,7 @@ modify it under the same terms as Perl itself.
  
  =head1 VERSION
  
-Version 2.135_05  (February 18 2012)
+Version 2.135_06  (March 20 2012)
  
  =head1 SEE ALSO
  
diff --git a/dist/Data-Dumper/Dumper.xs b/dist/Data-Dumper/Dumper.xs

index 4b7af7c..91e4c6c 100644 (file)
--- a/dist/Data-Dumper/Dumper.xs
+++ b/dist/Data-Dumper/Dumper.xs
@@ -37,17 +37,17 @@ static I32 DD_dump (pTHX_ SV *val, const char *name, STRLEN namelen, SV *retval,
  # endif
  
  UV
-Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
+Perl_utf8_to_uvchr_buf(pTHX_ U8 *s, U8 *send, STRLEN *retlen)
  {
-    const UV uv = utf8_to_uv(s, UTF8_MAXLEN, retlen,
+    const UV uv = utf8_to_uv(s, send - s, retlen,
                      ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
      return UNI_TO_NATIVE(uv);
  }
  
  # if !defined(PERL_IMPLICIT_CONTEXT)
-#  define utf8_to_uvchr             Perl_utf8_to_uvchr
+#  define utf8_to_uvchr_buf         Perl_utf8_to_uvchr_buf
  # else
-#  define utf8_to_uvchr(a,b) Perl_utf8_to_uvchr(aTHX_ a,b)
+#  define utf8_to_uvchr_buf(a,b) Perl_utf8_to_uvchr_buf(aTHX_ a,b)
  # endif
  
  #endif /* PERL_VERSION <= 6 */
@@ -147,7 +147,7 @@ esc_q_utf8(pTHX_ SV* sv, register const char *src, register STRLEN slen)
  
      /* this will need EBCDICification */
      for (s = src; s < send; s += increment) {
-        const UV k = utf8_to_uvchr((U8*)s, NULL);
+        const UV k = utf8_to_uvchr_buf((U8*)s, (U8*) send, NULL);
  
          /* check for invalid utf8 */
          increment = (k == 0 && *s != '\0') ? 1 : UTF8SKIP(s);
@@ -184,7 +184,7 @@ esc_q_utf8(pTHX_ SV* sv, register const char *src, register STRLEN slen)
          *r++ = '"';
  
          for (s = src; s < send; s += UTF8SKIP(s)) {
-            const UV k = utf8_to_uvchr((U8*)s, NULL);
+            const UV k = utf8_to_uvchr_buf((U8*)s, (U8*) send, NULL);
  
              if (k == '"' || k == '\\' || k == '$' || k == '@') {
                  *r++ = '\\';
diff --git a/dump.c b/dump.c

index 2c635de..b238ee0 100644 (file)
--- a/dump.c
+++ b/dump.c
@@ -281,7 +281,7 @@ Perl_pv_escape( pTHX_ SV *dsv, char const * const str,
          isuni = 1;
      
      for ( ; (pv < end && (!max || (wrote < max))) ; pv += readsize ) {
-        const UV u= (isuni) ? utf8_to_uvchr((U8*)pv, &readsize) : (U8)*pv;            
+        const UV u= (isuni) ? utf8_to_uvchr_buf((U8*)pv, (U8*) end, &readsize) : (U8)*pv;
          const U8 c = (U8)u & 0xFF;
          
          if ( ( u > 255 )
@@ -2420,7 +2420,7 @@ Perl_sv_catxmlpvn(pTHX_ SV *dsv, const char *pv, STRLEN len, int utf8)
    retry:
      while (pv < e) {
         if (utf8) {
-           c = utf8_to_uvchr((U8*)pv, &cl);
+           c = utf8_to_uvchr_buf((U8*)pv, (U8*)e, &cl);
             if (cl == 0) {
                 SvCUR(dsv) = dsvcur;
                 pv = start;
diff --git a/ext/XS-APItest/APItest.pm b/ext/XS-APItest/APItest.pm

index 7e7e8de..78d77f1 100644 (file)
--- a/ext/XS-APItest/APItest.pm
+++ b/ext/XS-APItest/APItest.pm
@@ -5,7 +5,7 @@ use strict;
  use warnings;
  use Carp;
  
-our $VERSION = '0.36';
+our $VERSION = '0.37';
  
  require XSLoader;
  
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs

index 6e8689c..5105960 100644 (file)
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -148,7 +148,7 @@ bitflip_key(pTHX_ IV action, SV *field) {
                 const char *const end = p + len;
                 while (p < end) {
                     STRLEN len;
-                   UV chr = utf8_to_uvuni((U8 *)p, &len);
+                   UV chr = utf8_to_uvuni_buf((U8 *)p, (U8 *) end, &len);
                     new_p = (char *)uvuni_to_utf8((U8 *)new_p, chr ^ 32);
                     p += len;
                 }
diff --git a/handy.h b/handy.h

index c437447..c90a876 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -949,7 +949,8 @@ EXTCONST U32 PL_charclass[];
                                                                     *((p)+1)))  \
                                             : function(p))
  
-/* Note that all ignore 'use bytes' */
+/* Note that all assume that the utf8 has been validated, and ignore 'use
+ * bytes' */
  
  #define isALNUM_utf8(p)                generic_utf8(isWORDCHAR, is_utf8_alnum, p)
  /* To prevent S_scan_word in toke.c from hanging, we have to make sure that
@@ -992,18 +993,18 @@ EXTCONST U32 PL_charclass[];
                                    : isSPACE_utf8(p)))
  #define isBLANK_utf8(c)                isBLANK(c) /* could be wrong */
  
-#define isALNUM_LC_utf8(p)     isALNUM_LC_uvchr(utf8_to_uvchr(p,  0))
-#define isIDFIRST_LC_utf8(p)   isIDFIRST_LC_uvchr(utf8_to_uvchr(p,  0))
-#define isALPHA_LC_utf8(p)     isALPHA_LC_uvchr(utf8_to_uvchr(p,  0))
-#define isSPACE_LC_utf8(p)     isSPACE_LC_uvchr(utf8_to_uvchr(p,  0))
-#define isDIGIT_LC_utf8(p)     isDIGIT_LC_uvchr(utf8_to_uvchr(p,  0))
-#define isUPPER_LC_utf8(p)     isUPPER_LC_uvchr(utf8_to_uvchr(p,  0))
-#define isLOWER_LC_utf8(p)     isLOWER_LC_uvchr(utf8_to_uvchr(p,  0))
-#define isALNUMC_LC_utf8(p)    isALNUMC_LC_uvchr(utf8_to_uvchr(p,  0))
-#define isCNTRL_LC_utf8(p)     isCNTRL_LC_uvchr(utf8_to_uvchr(p,  0))
-#define isGRAPH_LC_utf8(p)     isGRAPH_LC_uvchr(utf8_to_uvchr(p,  0))
-#define isPRINT_LC_utf8(p)     isPRINT_LC_uvchr(utf8_to_uvchr(p,  0))
-#define isPUNCT_LC_utf8(p)     isPUNCT_LC_uvchr(utf8_to_uvchr(p,  0))
+#define isALNUM_LC_utf8(p)     isALNUM_LC_uvchr(valid_utf8_to_uvchr(p,  0))
+#define isIDFIRST_LC_utf8(p)   isIDFIRST_LC_uvchr(valid_utf8_to_uvchr(p,  0))
+#define isALPHA_LC_utf8(p)     isALPHA_LC_uvchr(valid_utf8_to_uvchr(p,  0))
+#define isSPACE_LC_utf8(p)     isSPACE_LC_uvchr(valid_utf8_to_uvchr(p,  0))
+#define isDIGIT_LC_utf8(p)     isDIGIT_LC_uvchr(valid_utf8_to_uvchr(p,  0))
+#define isUPPER_LC_utf8(p)     isUPPER_LC_uvchr(valid_utf8_to_uvchr(p,  0))
+#define isLOWER_LC_utf8(p)     isLOWER_LC_uvchr(valid_utf8_to_uvchr(p,  0))
+#define isALNUMC_LC_utf8(p)    isALNUMC_LC_uvchr(valid_utf8_to_uvchr(p,  0))
+#define isCNTRL_LC_utf8(p)     isCNTRL_LC_uvchr(valid_utf8_to_uvchr(p,  0))
+#define isGRAPH_LC_utf8(p)     isGRAPH_LC_uvchr(valid_utf8_to_uvchr(p,  0))
+#define isPRINT_LC_utf8(p)     isPRINT_LC_uvchr(valid_utf8_to_uvchr(p,  0))
+#define isPUNCT_LC_utf8(p)     isPUNCT_LC_uvchr(valid_utf8_to_uvchr(p,  0))
  
  #define isPSXSPC_LC_utf8(c)    (isSPACE_LC_utf8(c) ||(c) == '\f')
  #define isBLANK_LC_utf8(c)     isBLANK(c) /* could be wrong */
diff --git a/pod/perlguts.pod b/pod/perlguts.pod

index ee938ea..908fa1f 100644 (file)
--- a/pod/perlguts.pod
+++ b/pod/perlguts.pod
@@ -2670,17 +2670,18 @@ character like this (the UTF8_IS_INVARIANT() is a macro that tests
  whether the byte can be encoded as a single byte even in UTF-8):
  
      U8 *utf;
+    U8 *utf_end; /* 1 beyond buffer pointed to by utf */
      UV uv;     /* Note: a UV, not a U8, not a char */
      STRLEN len; /* length of character in bytes */
  
      if (!UTF8_IS_INVARIANT(*utf))
          /* Must treat this as UTF-8 */
-        uv = utf8_to_uvchr(utf, &len);
+        uv = utf8_to_uvchr_buf(utf, utf_end, &len);
      else
          /* OK to treat this character as a byte */
          uv = *utf;
  
-You can also see in that example that we use C<utf8_to_uvchr> to get the
+You can also see in that example that we use C<utf8_to_uvchr_buf> to get the
  value of the character; the inverse function C<uvchr_to_utf8> is available
  for putting a UV into UTF-8:
  
@@ -2792,7 +2793,7 @@ it's not - if you pass on the PV to somewhere, pass on the flag too.
  
  =item *
  
-If a string is UTF-8, B<always> use C<utf8_to_uvchr> to get at the value,
+If a string is UTF-8, B<always> use C<utf8_to_uvchr_buf> to get at the value,
  unless C<UTF8_IS_INVARIANT(*s)> in which case you can use C<*s>.
  
  =item *
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod

index b96efbf..74c1666 100644 (file)
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -1514,7 +1514,8 @@ pointing after the UTF-8 bytes.  It works appropriately on EBCDIC machines.
  
  =item *
  
-C<utf8_to_uvchr(buf, lenp)> reads UTF-8 encoded bytes from a buffer and
+C<utf8_to_uvchr_buf(buf, bufend, lenp)> reads UTF-8 encoded bytes from a
+buffer and
  returns the Unicode character code point and, optionally, the length of
  the UTF-8 byte sequence.  It works appropriately on EBCDIC machines.
  
diff --git a/pp.c b/pp.c

index f3c4ebb..ba3ac1f 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -3383,7 +3383,7 @@ PP(pp_chr)
          sv_recode_to_utf8(TARG, PL_encoding);
         tmps = SvPVX(TARG);
         if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) ||
-           UNICODE_IS_REPLACEMENT(utf8_to_uvchr((U8*)tmps, NULL))) {
+           UNICODE_IS_REPLACEMENT(utf8_to_uvchr_buf((U8*)tmps, (U8*) tmps + SvCUR(TARG), NULL))) {
             SvGROW(TARG, 2);
             tmps = SvPVX(TARG);
             SvCUR_set(TARG, 1);
@@ -3795,7 +3795,7 @@ PP(pp_uc)
              uv = _to_utf8_upper_flags(s, tmpbuf, &ulen,
                                       cBOOL(IN_LOCALE_RUNTIME), &tainted);
              if (uv == GREEK_CAPITAL_LETTER_IOTA
-                && utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI)
+                && utf8_to_uvchr_buf(s, send, 0) == COMBINING_GREEK_YPOGEGRAMMENI)
              {
                  in_iota_subscript = TRUE;
              }
@@ -5344,7 +5344,7 @@ PP(pp_reverse)
                         continue;
                     }
                     else {
-                       if (!utf8_to_uvchr(s, 0))
+                       if (!utf8_to_uvchr_buf(s, send, 0))
                             break;
                         up = (char*)s;
                         s += UTF8SKIP(s);
diff --git a/regcomp.c b/regcomp.c

index e3da6e9..8c287bf 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -3480,8 +3480,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             UV uc;
             if (UTF) {
                 const U8 * const s = (U8*)STRING(scan);
+               uc = utf8_to_uvchr_buf(s, s + l, NULL);
                 l = utf8_length(s, s + l);
-               uc = utf8_to_uvchr(s, NULL);
             } else {
                 uc = *((U8*)STRING(scan));
             }
@@ -3575,8 +3575,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             }
             if (UTF) {
                 const U8 * const s = (U8 *)STRING(scan);
+               uc = utf8_to_uvchr_buf(s, s + l, NULL);
                 l = utf8_length(s, s + l);
-               uc = utf8_to_uvchr(s, NULL);
             }
             else if (has_exactf_sharp_s) {
                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
@@ -9822,7 +9822,10 @@ tryagain:
                               for (foldbuf = tmpbuf;
                                    foldlen;
                                    foldlen -= numlen) {
-                                  ender = utf8_to_uvchr(foldbuf, &numlen);
+
+                                  /* tmpbuf has been constructed by us, so we
+                                   * know it is valid utf8 */
+                                  ender = valid_utf8_to_uvchr(foldbuf, &numlen);
                                    if (numlen > 0) {
                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
                                         s       += unilen;
@@ -9858,7 +9861,7 @@ tryagain:
                           for (foldbuf = tmpbuf;
                                foldlen;
                                foldlen -= numlen) {
-                              ender = utf8_to_uvchr(foldbuf, &numlen);
+                              ender = valid_utf8_to_uvchr(foldbuf, &numlen);
                                if (numlen > 0) {
                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
                                     len     += unilen;
diff --git a/symbian/PerlBase.cpp b/symbian/PerlBase.cpp

index 4162e57..9312abe 100644 (file)
--- a/symbian/PerlBase.cpp
+++ b/symbian/PerlBase.cpp
@@ -364,7 +364,9 @@ int CPerlBase::ConsoleRead(const int fd, char* buf, int n)
  #else
      dTHX;
      for (i = 0; i < nUtf8; i+= UTF8SKIP(pUtf8 + i)) {
-        unsigned long u = utf8_to_uvchr((U8*)(pUtf8 + i), 0);
+        unsigned long u = utf8_to_uvchr_buf((U8*)(pUtf8 + i),
+                                            (U8*)(pUtf8 + nUtf8),
+                                            0);
          if (u > 0xFF) {
              iConsole->Printf(_L("(keycode > 0xFF)\n"));
              buf[i] = 0;
@@ -401,7 +403,7 @@ int CPerlBase::ConsoleWrite(const int fd, const char* buf, int n)
      dTHX;
      if (is_utf8_string((U8*)buf, n)) {
          for (int i = 0; i < n; i += UTF8SKIP(buf + i)) {
-            TChar u = utf8_to_uvchr((U8*)(buf + i), 0);
+            TChar u = valid_utf8_to_uvchr((U8*)(buf + i), 0);
              iConsole->Printf(_L("%c"), u);
              wrote++;
          }
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8

index 608f198..f6fa8f2 100644 (file)
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -1,7 +1,7 @@
  
    utf8.c AOK
  
-     [utf8_to_uvchr]
+     [utf8_to_uvchr_buf]
       Malformed UTF-8 character
         my $a = ord "\x80" ;
  
@@ -14,7 +14,7 @@
       <<<<<< Add a test when something actually calls utf16_to_utf8
  
  __END__
-# utf8.c [utf8_to_uvchr] -W
+# utf8.c [utf8_to_uvchr_buf] -W
  BEGIN {
      if (ord('A') == 193) {
          print "SKIPPED\n# ebcdic platforms do not generate Malformed UTF-8 warnings.";
diff --git a/toke.c b/toke.c

index 829ff86..58142ab 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -9883,7 +9883,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
         termlen = 1;
      }
      else {
-       termcode = utf8_to_uvchr((U8*)s, &termlen);
+       termcode = utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend, &termlen);
         Copy(s, termstr, termlen, U8);
         if (!UTF8_IS_INVARIANT(term))
             has_utf8 = TRUE;
diff --git a/utf8.c b/utf8.c

index c9bc63a..85bf2f0 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -839,7 +839,11 @@ Returns the native code point of the first character in the string C<s>
  which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  length, in bytes, of that character.
  
-If C<s> does not point to a well-formed UTF-8 character, zero is
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer.
+Use L</utf8_to_uvchr_buf> instead.
+
+If C<s> points to one of the detected malformations, zero is
  returned and C<retlen> is set, if possible, to -1.
  
  =cut
@@ -850,8 +854,7 @@ Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
  {
      PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
  
-    return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
-                         ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+    return valid_utf8_to_uvchr(s, retlen);
  }
  
  /*
@@ -902,10 +905,11 @@ Returns the Unicode code point of the first character in the string C<s>
  which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  length, in bytes, of that character.
  
-This function should only be used when the returned UV is considered
-an index into the Unicode semantic tables (e.g. swashes).
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer.
+Use L</utf8_to_uvuni_buf> instead.
  
-If C<s> does not point to a well-formed UTF-8 character, zero is
+If C<s> points to one of the detected malformations, zero is
  returned and C<retlen> is set, if possible, to -1.
  
  =cut
@@ -916,9 +920,7 @@ Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
  {
      PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
  
-    /* Call the low level routine asking for checks */
-    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
-                              ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+    return valid_utf8_to_uvuni(s, retlen);
  }
  
  /*
@@ -1128,7 +1130,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
      d = s = save;
      while (s < send) {
          STRLEN ulen;
-        *d++ = (U8)utf8_to_uvchr(s, &ulen);
+        *d++ = (U8)utf8_to_uvchr_buf(s, send, &ulen);
          s += ulen;
      }
      *d = '\0';
@@ -2154,7 +2156,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
      dVAR;
      U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
      STRLEN len = 0;
-    const UV uv0 = utf8_to_uvchr(p, NULL);
+    const UV uv0 = valid_utf8_to_uvchr(p, NULL);
      /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
       * are necessary in EBCDIC, they are redundant no-ops
       * in ASCII-ish platforms, and hopefully optimized away. */
@@ -2302,7 +2304,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
  bad_crossing:
  
      /* Failed, have to return the original */
-    original = utf8_to_uvchr(p, lenp);
+    original = valid_utf8_to_uvchr(p, lenp);
      Copy(p, ustrp, *lenp, char);
      return original;
  }
@@ -3508,7 +3510,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
                            "unexpectedly is not a string, flags=%lu",
                            (unsigned long)SvFLAGS(sv_to));
             }
-           /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", utf8_to_uvchr((U8*) char_from, 0), utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
+           /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
  
             /* Each key in the inverse list is a mapped-to value, and the key's
              * hash value is a list of the strings (each in utf8) that map to
@@ -3575,7 +3577,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
                         Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
                     }
  
-                   /* For debugging: UV u = utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
+                   /* For debugging: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
                     for (j = 0; j <= av_len(from_list); j++) {
                         entryp = av_fetch(from_list, j, FALSE);
                         if (entryp == NULL) {
@@ -3583,9 +3585,11 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
                         }
  
                         /* When i==j this adds itself to the list */
-                       av_push(i_list, newSVuv(utf8_to_uvchr(
-                                               (U8*) SvPVX(*entryp), 0)));
-                       /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
+                       av_push(i_list, newSVuv(utf8_to_uvchr_buf(
+                                       (U8*) SvPVX(*entryp),
+                                       (U8*) SvPVX(*entryp) + SvCUR(*entryp),
+                                       0)));
+                       /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
                     }
                 }
             }
@@ -3931,7 +3935,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
             STRLEN char_len;
             if (UTF8_IS_SUPER(s)) {
                 if (ckWARN_d(WARN_NON_UNICODE)) {
-                   UV uv = utf8_to_uvchr(s, &char_len);
+                   UV uv = utf8_to_uvchr_buf(s, e, &char_len);
                     Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
                         "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
                     ok = FALSE;
@@ -3939,7 +3943,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
             }
             else if (UTF8_IS_SURROGATE(s)) {
                 if (ckWARN_d(WARN_SURROGATE)) {
-                   UV uv = utf8_to_uvchr(s, &char_len);
+                   UV uv = utf8_to_uvchr_buf(s, e, &char_len);
                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
                         "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
                     ok = FALSE;
@@ -3949,7 +3953,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
                 ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
                  && (ckWARN_d(WARN_NONCHAR)))
             {
-               UV uv = utf8_to_uvchr(s, &char_len);
+               UV uv = utf8_to_uvchr_buf(s, e, &char_len);
                 Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
                     "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
                 ok = FALSE;
@@ -3999,7 +4003,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f
               truncated++;
               break;
          }
-        u = utf8_to_uvchr((U8*)s, 0);
+        u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
          if (u < 256) {
              const unsigned char c = (unsigned char)u & 0xFF;
              if (flags & UNI_DISPLAY_BACKSLASH) {
author	Karl Williamson <public@khwilliamson.com>
	Mon, 19 Mar 2012 21:38:06 +0000 (15:38 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Tue, 20 Mar 2012 00:23:44 +0000 (18:23 -0600)
cygwin/cygwin.c		patch \| blob \| history
dist/Data-Dumper/Dumper.pm		patch \| blob \| history
dist/Data-Dumper/Dumper.xs		patch \| blob \| history
dump.c		patch \| blob \| history
ext/XS-APItest/APItest.pm		patch \| blob \| history
ext/XS-APItest/APItest.xs		patch \| blob \| history
handy.h		patch \| blob \| history
pod/perlguts.pod		patch \| blob \| history
pod/perlunicode.pod		patch \| blob \| history
pp.c		patch \| blob \| history
regcomp.c		patch \| blob \| history
symbian/PerlBase.cpp		patch \| blob \| history
t/lib/warnings/utf8		patch \| blob \| history
toke.c		patch \| blob \| history
utf8.c		patch \| blob \| history