Add macro OFFUNISKIP

author Karl Williamson <public@khwilliamson.com>

Tue, 26 Feb 2013 20:35:12 +0000 (13:35 -0700)

committer Karl Williamson <public@khwilliamson.com>

Thu, 29 Aug 2013 15:55:58 +0000 (09:55 -0600)
author Karl Williamson <public@khwilliamson.com>
Tue, 26 Feb 2013 20:35:12 +0000 (13:35 -0700)
committer Karl Williamson <public@khwilliamson.com>
Thu, 29 Aug 2013 15:55:58 +0000 (09:55 -0600)
diff --git a/toke.c b/toke.c

index 73f3590..735e0db 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -3772,7 +3772,7 @@ S_scan_const(pTHX_ char *start)
             const UV nextuv   = (this_utf8)
                                  ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
                                  : (UV) ((U8) *s);
-           const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
+           const STRLEN need = UNISKIP(nextuv);
             if (!has_utf8) {
                 SvCUR_set(sv, d - SvPVX_const(sv));
                 SvPOK_on(sv);
diff --git a/utf8.c b/utf8.c

index 945e31c..cc64ee6 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -184,7 +184,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
      }
  #if defined(EBCDIC)
      else {
-       STRLEN len  = UNISKIP(uv);
+       STRLEN len  = OFFUNISKIP(uv);
         U8 *p = d+len-1;
         while (p > d) {
             *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
@@ -772,7 +772,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
  #endif
  
      if (do_overlong_test
-       && expectlen > (STRLEN)UNISKIP(uv)
+       && expectlen > (STRLEN) OFFUNISKIP(uv)
         && ! (flags & UTF8_ALLOW_LONG))
      {
         /* The overlong malformation has lower precedence than the others.
@@ -780,7 +780,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
          * value, instead of the replacement character.  This is because this
          * value is actually well-defined. */
         if (! (flags & UTF8_CHECK_ONLY)) {
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), *s0));
+           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
         }
         goto malformed;
      }
diff --git a/utf8.h b/utf8.h

index b3bf997..1ecb3b8 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -231,7 +231,8 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
             - UTF_ACCUMULATION_SHIFT))
  
  #ifdef HAS_QUAD
-#define UNISKIP(uv) ( (uv) < 0x80           ? 1 : \
+/* Input is a true Unicode (not-native) code point */
+#define OFFUNISKIP(uv) ( (uv) < 0x80        ? 1 : \
                       (uv) < 0x800          ? 2 : \
                       (uv) < 0x10000        ? 3 : \
                       (uv) < 0x200000       ? 4 : \
@@ -240,7 +241,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
                        (uv) < UTF8_QUAD_MAX ? 7 : 13 )
  #else
  /* No, I'm not even going to *TRY* putting #ifdef inside a #define */
-#define UNISKIP(uv) ( (uv) < 0x80           ? 1 : \
+#define OFFUNISKIP(uv) ( (uv) < 0x80        ? 1 : \
                       (uv) < 0x800          ? 2 : \
                       (uv) < 0x10000        ? 3 : \
                       (uv) < 0x200000       ? 4 : \
@@ -297,6 +298,15 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
                                                 && ( (e) - (s) > 1)             \
                                                 && UTF8_IS_CONTINUATION(*((s)+1)))
  
+/* Number of bytes a native code point occupies when encoded as UTF-8. */
+#define NATIVE_SKIP(uv) OFFUNISKIP(NATIVE_TO_UNI(uv))
+
+/* Most code which says UNISKIP is really thinking in terms of native code
+ * points (0-255) plus all those beyond.  This is an imprecise term, but
+ * keeping UNISKIP means existing code continues to work.  For precision, use
+ * NATIVE_SKIP (native input) or OFFUNISKIP (true Unicode input) instead. */
+#define UNISKIP(uv)   NATIVE_SKIP(uv)
+
  /* Convert a two (not one) byte utf8 character to a native code point value.
   * Needs just one iteration of accumulate.  Should not be used unless it is
   * known that the two bytes are legal: 1) two-byte start, and 2) continuation.
diff --git a/utfebcdic.h b/utfebcdic.h

index 09d59fa..856bcd7 100644 (file)
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -701,7 +701,8 @@ END_EXTERN_C
  
   */
  
-#define UNISKIP(uv) ( (uv) < 0xA0           ? 1 : \
+/* Input is a true Unicode (not-native) code point */
+#define OFFUNISKIP(uv) ( (uv) < 0xA0        ? 1 : \
                       (uv) < 0x400          ? 2 : \
                       (uv) < 0x4000         ? 3 : \
                       (uv) < 0x40000        ? 4 : \
author	Karl Williamson <public@khwilliamson.com>
	Tue, 26 Feb 2013 20:35:12 +0000 (13:35 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Thu, 29 Aug 2013 15:55:58 +0000 (09:55 -0600)
toke.c		patch \| blob \| history
utf8.c		patch \| blob \| history
utf8.h		patch \| blob \| history
utfebcdic.h		patch \| blob \| history