Introduce macros for UTF8 decoding.

author Jarkko Hietaniemi <jhi@iki.fi>

Fri, 8 Dec 2000 01:19:08 +0000 (01:19 +0000)

committer Jarkko Hietaniemi <jhi@iki.fi>

Fri, 8 Dec 2000 01:19:08 +0000 (01:19 +0000)
author Jarkko Hietaniemi <jhi@iki.fi>
Fri, 8 Dec 2000 01:19:08 +0000 (01:19 +0000)
committer Jarkko Hietaniemi <jhi@iki.fi>
Fri, 8 Dec 2000 01:19:08 +0000 (01:19 +0000)
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t

index c631c0a..ac42b85 100644 (file)
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -53,11 +53,11 @@ my @MK = split(/\n/, <<__EOMK__);
  3.1.8 n "\80¿\80¿\80¿\80"      -               7       80:bf:80:bf:80:bf:80    -       unexpected continuation byte 0x80
  3.1.9 n "\80\81\82\83\84\85\86\87\88\89\8a\8b\8c\8d\8e\8f\90\91\92\93\94\95\96\97\98\99\9a\9b\9c\9d\9e\9f ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿"                             -       64      80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf -       unexpected continuation byte 0x80
  3.2    Lonely start characters
-3.2.1 n "À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß "     -       64      c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 -       unexpected non-continuation byte 0x20 after byte 0xc0
-3.2.2 n "à á â ã ä å æ ç è é ê ë ì í î ï "     -       32      e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20 -       unexpected non-continuation byte 0x20 after byte 0xe0
-3.2.3 n "ð ñ ò ó ô õ ö ÷ "     -       16      f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 -       unexpected non-continuation byte 0x20 after byte 0xf0
-3.2.4 n "ø ù ú û "             -       8       f8:20:f9:20:fa:20:fb:20 -       unexpected non-continuation byte 0x20 after byte 0xf8
-3.2.5 n "ü ý "                 -       4       fc:20:fd:20     -       unexpected non-continuation byte 0x20 after byte 0xfc
+3.2.1 n "À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß "     -       64      c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 -       unexpected non-continuation byte 0x20 after start byte 0xc0
+3.2.2 n "à á â ã ä å æ ç è é ê ë ì í î ï "     -       32      e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20 -       unexpected non-continuation byte 0x20 after start byte 0xe0
+3.2.3 n "ð ñ ò ó ô õ ö ÷ "     -       16      f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 -       unexpected non-continuation byte 0x20 after start byte 0xf0
+3.2.4 n "ø ù ú û "             -       8       f8:20:f9:20:fa:20:fb:20 -       unexpected non-continuation byte 0x20 after start byte 0xf8
+3.2.5 n "ü ý "                 -       4       fc:20:fd:20     -       unexpected non-continuation byte 0x20 after start byte 0xfc
  3.3    Sequences with last continuation byte missing
  3.3.1 n "À"                    -       1       c0      -       1 byte, need 2
  3.3.2 n "à\80"                   -       2       e0:80   -       2 bytes, need 3
@@ -70,7 +70,7 @@ my @MK = split(/\n/, <<__EOMK__);
  3.3.9 n "û¿¿¿"                 -       4       fb:bf:bf:bf     -       4 bytes, need 5
  3.3.10 n "ý¿¿¿¿"               -       5       fd:bf:bf:bf:bf  -       5 bytes, need 6
  3.4    Concatenation of incomplete sequences
-3.4.1 n "Àà\80ð\80\80ø\80\80\80ü\80\80\80\80ßï¿÷¿¿û¿¿¿ý¿¿¿¿"       -       30      c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf       -       unexpected continuation byte 0xe0
+3.4.1 n "Àà\80ð\80\80ø\80\80\80ü\80\80\80\80ßï¿÷¿¿û¿¿¿ý¿¿¿¿"       -       30      c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf       -       unexpected non-continuation byte 0xe0 after start byte 0xc0
  3.5    Impossible bytes
  3.5.1 n "þ"                    -       1       fe      -       byte 0xfe
  3.5.2 n "ÿ"                    -       1       ff      -       byte 0xff
@@ -125,7 +125,7 @@ __EOMK__
  
      local $SIG{__WARN__} =
         sub {
-           # print "# $id: @_";
+           print "# $id: @_";
             $WARNCNT++;
             $WARNMSG = "@_";
         };
diff --git a/t/pragma/warn/utf8 b/t/pragma/warn/utf8

index adc10c6..9a7dbaf 100644 (file)
--- a/t/pragma/warn/utf8
+++ b/t/pragma/warn/utf8
@@ -30,6 +30,6 @@ my $a = "sn
      my $a = "snøstorm";
  }
  EXPECT
-Malformed UTF-8 character (unexpected non-continuation byte 0x73 after byte 0xf8) at - line 9.
-Malformed UTF-8 character (unexpected non-continuation byte 0x73 after byte 0xf8) at - line 14.
+Malformed UTF-8 character (unexpected non-continuation byte 0x73 after start byte 0xf8) at - line 9.
+Malformed UTF-8 character (unexpected non-continuation byte 0x73 after start byte 0xf8) at - line 14.
  ########
diff --git a/utf8.c b/utf8.c

index 9ef7ce1..98e13e8 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -190,10 +190,10 @@ If C<s> does not point to a well-formed UTF8 character, the behaviour
  is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
  it is assumed that the caller will raise a warning, and this function
  will set C<retlen> to C<-1> and return zero.  If the C<flags> does not
-contain UTF8_CHECK_ONLY, the UNICODE_REPLACEMENT_CHARACTER (0xFFFD)
-will be returned, and C<retlen> will be set to the expected length of
-the UTF-8 character in bytes.  The C<flags> can also contain various
-flags to allow deviations from the strict UTF-8 encoding (see F<utf8.h>).
+contain UTF8_CHECK_ONLY, the UNICODE_REPLACEMENT (0xFFFD) will be
+returned, and C<retlen> will be set to the expected length of the
+UTF-8 character in bytes.  The C<flags> can also contain various flags
+to allow deviations from the strict UTF-8 encoding (see F<utf8.h>).
  
  =cut */
  
@@ -216,13 +216,13 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
         goto malformed;
      }
  
-    if (uv <= 0x7f) { /* Pure ASCII. */
+    if (UTF8_IS_ASCII(uv)) {
         if (retlen)
             *retlen = 1;
         return *s;
      }
  
-    if ((uv >= 0x80 && uv <= 0xbf) &&
+    if (UTF8_IS_CONTINUATION(uv) &&
         !(flags & UTF8_ALLOW_CONTINUATION)) {
         if (dowarn)
             Perl_warner(aTHX_ WARN_UTF8,
@@ -231,11 +231,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
         goto malformed;
      }
  
-    if ((uv >= 0xc0 && uv <= 0xfd && curlen > 1 && s[1] < 0x80) &&
+    if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
         if (dowarn)
             Perl_warner(aTHX_ WARN_UTF8,
-                       "Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after byte 0x%02"UVxf")",
+                       "Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
                         (UV)s[1], uv);
         goto malformed;
      }
@@ -276,10 +276,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
      ouv = uv;
  
      while (len--) {
-       if ((*s & 0xc0) != 0x80) {
+       if (!UTF8_IS_CONTINUATION(*s) &&
+           !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
             if (dowarn)
                 Perl_warner(aTHX_ WARN_UTF8,
-                           "Malformed UTF-8 character (unexpected continuation byte 0x%02x)",
+                           "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x)",
                             *s);
             goto malformed;
         }
@@ -297,14 +298,14 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
         ouv = uv;
      }
  
-    if ((uv >= 0xd800 && uv <= 0xdfff) &&
+    if (UNICODE_IS_SURROGATE(uv) &&
         !(flags & UTF8_ALLOW_SURROGATE)) {
         if (dowarn)
             Perl_warner(aTHX_ WARN_UTF8,
                         "Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")",
                         uv);
         goto malformed;
-    } else if ((uv == 0xfffe) &&
+    } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
                !(flags & UTF8_ALLOW_BOM)) {
         if (dowarn)
             Perl_warner(aTHX_ WARN_UTF8,
@@ -318,7 +319,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
                         "Malformed UTF-8 character (%d byte%s, need %d)",
                         expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
         goto malformed;
-    } else if ((uv == 0xffff) &&
+    } else if (UNICODE_IS_ILLEGAL(uv) &&
                !(flags & UTF8_ALLOW_FFFF)) {
         if (dowarn)
             Perl_warner(aTHX_ WARN_UTF8,
@@ -340,7 +341,7 @@ malformed:
      if (retlen)
         *retlen = expectlen;
  
-    return UNICODE_REPLACEMENT_CHARACTER;
+    return UNICODE_REPLACEMENT;
  }
  
  /*
diff --git a/utf8.h b/utf8.h

index 25ddc14..bafdc57 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -46,10 +46,26 @@ END_EXTERN_C
  #define UTF8_ALLOW_ANY                 0x00ff
  #define UTF8_CHECK_ONLY                        0x0100
  
+#define UNICODE_SURROGATE_FIRST                0xd800
+#define UNICODE_SURROGATE_LAST         0xdfff
+#define UNICODE_REPLACEMENT            0xfffd
+#define UNICODE_BYTER_ORDER_MARK       0xfffe
+#define UNICODE_ILLEGAL                        0xffff
+
+#define UNICODE_IS_SURROGATE(c)                ((c) >= UNICODE_SURROGATE_FIRST && \
+                                        (c) <= UNICODE_SURROGATE_LAST)
+#define UNICODE_IS_REPLACEMENT(c)      ((c) == UNICODE_REPLACMENT)
+#define UNICODE_IS_BYTE_ORDER_MARK(c)  ((c) == UNICODE_BYTER_ORDER_MARK)
+#define UNICODE_IS_ILLEGAL(c)          ((c) == UNICODE_ILLEGAL)
+
  #define UTF8SKIP(s) PL_utf8skip[*(U8*)s]
  
  #define UTF8_QUAD_MAX  UINT64_C(0x1000000000)
  
+#define UTF8_IS_ASCII(c)               ((c) <  0x80)
+#define UTF8_IS_START(c)               ((c) >= 0xc0 && ((c) <= 0xfd))
+#define UTF8_IS_CONTINUATION(c)                ((c) >= 0x80 && ((c) <= 0xbf))
+
  #ifdef HAS_QUAD
  #define UNISKIP(uv) ( (uv) < 0x80           ? 1 : \
                       (uv) < 0x800          ? 2 : \
@@ -68,7 +84,6 @@ END_EXTERN_C
                       (uv) < 0x80000000     ? 6 : 7 )
  #endif
  
-#define UNICODE_REPLACEMENT_CHARACTER  0xfffd
  
  /*
   * Note: we try to be careful never to call the isXXX_utf8() functions
author	Jarkko Hietaniemi <jhi@iki.fi>
	Fri, 8 Dec 2000 01:19:08 +0000 (01:19 +0000)
committer	Jarkko Hietaniemi <jhi@iki.fi>
	Fri, 8 Dec 2000 01:19:08 +0000 (01:19 +0000)
t/op/utf8decode.t		patch \| blob \| history
t/pragma/warn/utf8		patch \| blob \| history
utf8.c		patch \| blob \| history
utf8.h		patch \| blob \| history