Fix EBCDIC bugs in UTF8_ACCUMULATE and utf8.c
author Karl Williamson <public@khwilliamson.com>
Sun, 17 Mar 2013 04:41:15 +0000 (22:41 -0600)
committer Karl Williamson <public@khwilliamson.com>
Thu, 29 Aug 2013 15:56:03 +0000 (09:56 -0600)
utf8.c
utf8.h

diff --git a/utf8.c b/utf8.c
index 2a23995..e9b0f5c 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -623,7 +623,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
 
     /* An invariant is trivially well-formed */
     if (UTF8_IS_INVARIANT(uv)) {
-       return uv;
+       return NATIVE_TO_LATIN1(uv);
     }
 
     /* A continuation character can't start a valid sequence */
diff --git a/utf8.h b/utf8.h
index 4fc513b..4037a6a 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -289,9 +289,13 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 
 /* Adds a UTF8 continuation byte 'new' of information to a running total code
  * point 'old' of all the continuation bytes so far.  This is designed to be
- * used in a loop to convert from UTF-8 to the code point represented */
-#define UTF8_ACCUMULATE(old, new)      (((old) << UTF_ACCUMULATION_SHIFT)     \
-                                        | (((U8)new) & UTF_CONTINUATION_MASK))
+ * used in a loop to convert from UTF-8 to the code point represented.  Note
+ * that this is asymmetric on EBCDIC platforms, in that the 'new' parameter is
+ * the UTF-EBCDIC byte, whereas the 'old' parameter is a Unicode (not EBCDIC)
+ * code point in process of being generated */
+#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT)           \
+                                   | ((NATIVE_UTF8_TO_I8((U8)new))             \
+                                       & UTF_CONTINUATION_MASK))
 
 /* This works in the face of malformed UTF-8. */
 #define UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, e) (UTF8_IS_DOWNGRADEABLE_START(*s) \
@@ -314,7 +318,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  * downgradable */
 #define TWO_BYTE_UTF8_TO_NATIVE(HI, LO) \
      UNI_TO_NATIVE(UTF8_ACCUMULATE((NATIVE_UTF8_TO_I8(HI) & UTF_START_MASK(2)), \
-                                   NATIVE_UTF8_TO_I8(LO)))
+                                   (LO)))
 
 /* Should never be used, and be deprecated */
 #define TWO_BYTE_UTF8_TO_UNI(HI, LO) NATIVE_TO_UNI(TWO_BYTE_UTF8_TO_NATIVE(HI, LO))